Script for searching and web-crawling Springer books

So, some of you may have heard about the free e-books frenzy at Springer (unfortunately, not working any more). I jumped on the train of people trying to find the goodies on the website and ended up developing some tools to help me with that. Even after the freebies ended I kept improving this tool, because it is also useful in my day-to-day job of searching for bibliography and storing those PDFs. The script can navigate through a search result (passed to the script as a link to the search-result page) and download every book it finds, even when the results span many pages. This is also useful for downloading all books from a series (for example, the Springer Series in Statistics). In the future I intend to extend the script to download book chapters and research articles. It is especially useful if your university has access to Springer books. (Full code at https://gist.github.com/zehsilva/d890837cdfd4a20ee8a5#file-download-books-from-rss-springer-py)
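Quick usage, straight from the script's docstring below: pass a single book URL directly to download one book, or pass -s plus a search-result URL to crawl the whole result set:

python download-books-from-rss-springer.py 'http://rd.springer.com/book/10.1007/978-3-662-07003-1'
python download-books-from-rss-springer.py -s 'http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false'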


#!/usr/bin/python
""" @copyleft Eliezer Silva (djosacv@gmail.com)
This script was developed mostly for recreational and educational purposes, so use it at your own risk, as is.
With this script you can crawl Springer book search results and download all the books listed on the page. This version
also checks whether the result set spans several pages and navigates through those pages.
There are two ways of using it:
1) Link to a single book URL on the Springer page:
python download-books-from-rss-springer.py 'http://rd.springer.com/book/10.1007/978-3-662-07003-1'
2) Link to a search result with many books:
python download-books-from-rss-springer.py -s 'http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false'
"""
from HTMLParser import HTMLParser
from urlparse import urlparse
import urllib
import urllib2
import sys

def downloader(url, filename=None):
    """Download url to filename, printing coarse progress marks."""
    file_name = filename
    if filename is None:
        # fall back to the last path component of the URL as the file name
        file_name = url.split('/')[-1]
    u = urllib2.urlopen(url)
    f = open(file_name, 'wb')
    meta = u.info()
    file_size = int(meta.getheaders("Content-Length")[0])
    print "Downloading: %s Bytes to %s" % (file_size, file_name)
    file_size_dl = 0
    block_sz = 8192
    parcels = file_size / 30  # print a progress mark roughly every 1/30 of the file
    disp = False
    c = 1
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        file_size_dl += len(buffer)
        f.write(buffer)
        # (an in-place progress bar built with backspaces was disabled in
        #  favour of the simple marks below)
        if file_size_dl > c * parcels:
            c += 1
            disp = True
        if disp:
            print "_" + r"%3.2f%%" % (file_size_dl * 100. / file_size),
            disp = False
    print ""
    f.close()
class myhtmlparser(HTMLParser):
    """Parses a single book page: finds the PDF link, copyright year and authors."""
    def __init__(self, link):
        self.reset()
        self.inLink = False
        self.domain = self.get_domain(link)
        self.pdflink = ""
        self.bookname = None
        self.is_searching = True
        self.inyear = False
        self.year = ""
        self.author_list = []
        self.inauthorlist = False
        self.inauth = False

    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('class', 'copyright-year') in attrs:
            self.inyear = True
        if tag == 'a' and self.find_link(attrs) and self.is_searching:
            self.inLink = True
        if tag == 'div' and (('class', 'author-list') in attrs or ('class', 'editor-list') in attrs):
            self.inauthorlist = True
        if tag == 'a' and ('itemprop', 'name') in attrs and self.inauthorlist:
            self.inauth = True

    def handle_endtag(self, tag):
        if tag == 'div' and self.inauthorlist:
            self.inauthorlist = False

    def handle_data(self, data):
        if self.inLink:
            print data
            print "pdf link " + self.pdflink
            self.is_searching = False
            self.inLink = False
        if self.inyear:
            self.year = data.replace("\n", "").replace(" ", "")
            self.inyear = False
        if self.inauth:
            # normalize non-breaking spaces, then keep only the surname
            # (second whitespace-separated token of the author name)
            self.author_list.append(data.replace("\n", "").replace('\xc2\xa0', ' ').split()[1])
            self.inauth = False

    def doDownload(self):
        year = ""
        if self.year != "":
            year = "[" + self.year + "]"
        authn = ",".join(self.author_list)
        if authn != "":
            authn = "[" + authn + "]"
        downloader(self.pdflink, year + authn + self.bookname + ".pdf")

    def find_link(self, attrs):
        # a PDF link for a book carries doi, contenttype and href attributes
        if not attrs:
            return False
        attrnames = zip(*attrs)[0]
        ret = False
        bookn = ""
        if 'doi' in attrnames and ('contenttype', 'Book') in attrs and 'href' in attrnames:
            for pair_val in attrs:
                if pair_val[0] == 'href' and pair_val[1].endswith('pdf'):
                    self.pdflink = self.domain + pair_val[1]
                    ret = True
                if pair_val[0] == 'publication':
                    # build a file name from the 'publication' attribute
                    # ("part1|part2" becomes "part2.part1", with '/' made filename-safe)
                    bookn = pair_val[1].replace('/', '.').split("|")[1] + "." + pair_val[1].replace('/', '.').split("|")[0]
        if ret:
            self.bookname = bookn
        return ret

    def get_domain(self, url):
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
        return domain
def openurlitem(item):
    fp = urllib.urlopen(item)
    data = fp.read().replace('\n', '')
    return data


def process_item(url_link):
    # parse one book page and download its PDF
    parser = myhtmlparser(url_link)
    parser.feed(openurlitem(url_link))
    parser.doDownload()
class mylisthtmlparser(myhtmlparser):
    """Parses a search-result page: visits each book in the list and follows pagination."""
    def __init__(self, link):
        self.reset()
        self.inLink = False
        self.inResultList = False
        self.domain = self.get_domain(link)
        self.nextlink = ""
        self.pagination = False

    def handle_starttag(self, tag, attrs):
        if self.inResultList:
            if tag == 'a' and ('class', 'title') in attrs:
                self.inLink = True
                self.pdflink = self.getHref(attrs)
        if tag == 'ol' and ('id', 'results-list') in attrs:
            self.inResultList = True
        if tag == 'form' and ('class', 'pagination') in attrs:
            self.pagination = True
        if self.pagination and tag == 'a' and ('class', 'next') in attrs and ('title', 'next') in attrs:
            self.nextlink = self.pdflink = self.getHref(attrs)
            print "next link = " + self.nextlink

    def handle_endtag(self, tag):
        if tag == 'ol' and self.inResultList:
            self.inResultList = False
        if tag == 'form' and self.pagination:
            self.pagination = False

    def handle_data(self, data):
        if self.inLink:
            print "Opening " + data
            print "url link " + self.pdflink
            try:
                process_item(self.pdflink)
            except Exception as e:
                print "error: " + str(e)
            self.inLink = False

    def hasNext(self):
        return self.nextlink.startswith("http")

    def getHref(self, attrs):
        for pair_val in attrs:
            if pair_val[0] == 'href':
                return self.domain + pair_val[1]
def process_page(url):
    # crawl the first result page, then follow "next" links until none remain
    parser = mylisthtmlparser(url)
    parser.feed(openurlitem(url))
    while parser.hasNext():
        url = parser.nextlink
        parser = mylisthtmlparser(url)
        parser.feed(openurlitem(url))


if len(sys.argv) >= 2:
    args = sys.argv[1:]
    if len(args) == 2:
        if args[0] == "-s":
            try:
                process_page(args[1])
            except Exception as e:
                print "processing error (" + str(e) + "). check your url"
                print "format: 'python download-books-from-rss-springer.py [-s] url'"
        else:
            print "format: 'python download-books-from-rss-springer.py [-s] url'"
    elif len(args) == 1:
        try:
            process_item(args[0])
        except Exception as e:
            print "processing error (" + str(e) + "). check your url"
            print "format: 'python download-books-from-rss-springer.py [-s] url'"
    else:
        print "format: 'python download-books-from-rss-springer.py [-s] url'"
        print "if the -s argument is passed, the script will assume the link is a search-result page with a list of books"
        print "otherwise, it will assume the link is the url of a single book"
else:
    print "python download-books-from-rss-springer.py [-s] url"
# process_page("http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false")
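If the two parser classes look opaque, here is a minimal, self-contained sketch of the pattern they both rely on: HTMLParser fires handle_starttag/handle_data/handle_endtag callbacks as it scans the markup, and boolean flags record where in the document we currently are, so that handle_data knows which text nodes to keep. The TitleGrabber class and the sample markup are illustrative only, not part of the script.

from HTMLParser import HTMLParser

class TitleGrabber(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_title = False   # are we inside an <a class="title"> tag?
        self.titles = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and ('class', 'title') in attrs:
            self.in_title = True

    def handle_data(self, data):
        if self.in_title:
            self.titles.append(data)

    def handle_endtag(self, tag):
        if tag == 'a':
            self.in_title = False

p = TitleGrabber()
p.feed('<ol><li><a class="title" href="/book/1">Book One</a></li></ol>')
print p.titles   # ['Book One']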