Download Bird
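
A quick script to grab all of the review and revision PDFs from the ejcr.org teaching-sets pages: it scrapes the index for links to each set page, downloads the matching PDFs, and logs the source hrefs to metadata.txt. It needs the third-party requests and beautifulsoup4 packages.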

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def generate_url_list(input_url):
    """Collect the URL of every teaching-set page linked from the index."""
    r = requests.get(input_url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Each set page is linked through an image wrapped in an anchor.
    page_list = []
    for link in soup.select('a > img'):
        href = link.parent.get("href")
        if href:
            # urljoin leaves absolute URLs alone and resolves relative
            # ones against the index page.
            page_list.append(urljoin(input_url, href))

    print(len(page_list))
    return page_list


def generate_pdf_url(page_urls):
    """Visit each set page and download every review/revision PDF on it."""
    for page_url in page_urls:
        metadata = [page_url]
        count = 1
        r = requests.get(page_url)
        soup = BeautifulSoup(r.content, 'html.parser')
        # Page titles have the form "<prefix> -- <set name>"; keep the name.
        name = soup.title.get_text().split(" -- ")[1]
        print(name)
        for link in soup.find_all(href=review_revision_pdf_href):
            download_pdf(urljoin(r.url, link.get("href")), name + ' - ' + str(count))
            count += 1
            metadata.append(link.get("href"))
        write_metadata(metadata)


def review_revision_pdf_href(href):
    """Match hrefs that end in .pdf and mention "review" or "revision",
    skipping relative links that start with "../".
    """
    # The dots in "../" must be escaped; unescaped, they match any two
    # characters, so e.g. "ab/review.pdf" would be wrongly rejected.
    return href and re.search(r"^(?!\.\./).*(review|revision).*\.pdf$",
                              href, re.IGNORECASE)


def download_pdf(pdf_url, name):
    """Stream a PDF to disk as "<name>.pdf"."""
    r = requests.get(pdf_url, stream=True)
    with open(name + '.pdf', 'wb') as f:
        for chunk in r.iter_content(chunk_size=2000):
            f.write(chunk)


def write_metadata(items, path="metadata.txt"):
    """Append each metadata entry to the log file, one per line."""
    with open(path, 'a') as f:
        for item in items:
            f.write("%s\n" % item)

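
# Crawl the teaching-sets index, then pull the PDFs from each set page.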
if __name__ == "__main__":
    url_list = generate_url_list("http://www.ejcr.org/teaching-sets/teachingsets.html")
    generate_pdf_url(url_list)
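
Running it prints the number of set pages found, then each set's name as its PDFs are saved to the working directory as "<set name> - <n>.pdf"; the source hrefs are appended to metadata.txt.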
