Yesterday I wanted to download the PDF tutorial materials from Andrew Moore's site, but I hated clicking each link and saving every PDF one by one. So I wrote a script to do the job for me.
Here is the idea:

* Get the URL of every topic
* Get the URL of each PDF on a topic page
* Download it
In []:
import requests
#from pattern import web
import re
import os
import BeautifulSoup
import urllib2
#!~/anaconda/bin/pip install wget
#if wget (or any other package) is not installed, install it with pip as above
import wget
#wget is used to download each pdf

def findlinksnot(url, spec):
    #Find the URLs of all topics listed on the page http://www.autonlab.org/tutorials/list.html
    #spec='http' filters out absolute links, keeping only the relative topic links
    parent_dir = url[:url.rfind("/")] + "/"
    html = requests.get(url).text
    soup = BeautifulSoup.BeautifulSoup(html)
    links = [parent_dir + a['href'] for a in soup.findAll('a') if spec not in a['href']]
    return links

def findlink(url, spec):
    #Find all pdf links on a topic page such as http://www.autonlab.org/tutorials/infogain.html
    parent_dir = url[:url.rfind("/")] + "/"
    html = requests.get(url).text
    soup = BeautifulSoup.BeautifulSoup(html)
    links = [parent_dir + a['href'] for a in soup.findAll('a') if spec in a['href']]
    return links

def getpdf(link, newpath=os.getcwd()):
    #Download a single pdf into newpath
    name = wget.download(link, out=newpath)
    print "%s\n downloaded in %s\n" % (name, newpath)

def main(url, newpath=os.getcwd()):
    #if newpath does not exist, create it
    if not os.path.exists(newpath): os.makedirs(newpath)
    topics = findlinksnot(url, 'http')
    print topics
    for topic in topics:
        links = findlink(topic, '.pdf')
        for link in links:
            getpdf(link, newpath=newpath)

#if __name__ == '__main__':
#    main()
main('http://www.autonlab.org/tutorials/list.html')
Future work:

* Figure out how to make the script work with command-line arguments (see the sketch below)
* Learn more about Scrapy and do the job more automatically
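As a starting point for the command-line idea, here is a minimal sketch that wraps the main() function above with argparse; the flag names --url and --out are my own choices, not something the script already supports:

import argparse
import os

if __name__ == '__main__':
    # hypothetical command-line wrapper around main() defined above
    parser = argparse.ArgumentParser(description='Download all tutorial pdfs from a listing page')
    parser.add_argument('--url', default='http://www.autonlab.org/tutorials/list.html',
                        help='listing page that links to the topic pages')
    parser.add_argument('--out', default=os.getcwd(),
                        help='directory where the pdfs are saved')
    args = parser.parse_args()
    main(args.url, newpath=args.out)

For the Scrapy direction, a rough sketch of the same two-level crawl as a spider might look like the following. This is only an outline under my own assumptions: the spider name is made up, and handing the pdf URLs to Scrapy's FilesPipeline via file_urls would also require FILES_STORE and ITEM_PIPELINES to be configured in the project settings.

import scrapy

class TutorialPdfSpider(scrapy.Spider):
    name = 'autonlab_pdfs'   # hypothetical spider name
    start_urls = ['http://www.autonlab.org/tutorials/list.html']

    def parse(self, response):
        # follow the relative topic links, skipping absolute ('http') links
        for href in response.css('a::attr(href)').extract():
            if 'http' not in href:
                yield response.follow(href, callback=self.parse_topic)

    def parse_topic(self, response):
        # hand every .pdf link to the FilesPipeline via file_urls
        for href in response.css('a::attr(href)').extract():
            if href.endswith('.pdf'):
                yield {'file_urls': [response.urljoin(href)]}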