Thursday 30 May 2013

Scraping Best Selling Products from Amazon with Python and BeautifulSoup

The code here uses BeautifulSoup - a Python module for parsing HTML - to scrape best selling products from Amazon's website. The script first scrapes a list of product categories, then grabs the top 20 best selling products from each of those categories, and prints them to stdout.
The Code

from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser

def _fetch_soup(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree."""
    response = urlopen(url)
    try:
        return BeautifulSoup(response.read())
    finally:
        # Release the connection even if reading or parsing fails.
        response.close()


def _scrape_categories(soup, html_parser):
    """Return one {'name': ..., 'url': ...} dict per category link in *soup*.

    Raises RuntimeError when the 'zg_browseRoot' element or the <ul> inside
    it is missing, instead of failing with an AttributeError on None.
    """
    root = soup.find(attrs={'id': 'zg_browseRoot'})
    category_list = root.find('ul') if root is not None else None
    if category_list is None:
        raise RuntimeError("Could not find the category list "
                           "('zg_browseRoot') on the best sellers page")

    categories = []
    for category_li in category_list.findAll('li'):
        link = category_li.a
        if link is None:
            # A list item without an anchor has no url to follow; skip it.
            continue
        categories.append({
            'name': html_parser.unescape(link.string),
            'url': link['href'],
        })
    return categories


def main():
    """Print the best selling products of every Amazon category to stdout.

    The best sellers landing page is scraped for category names and urls.
    Each category page is then fetched and every product found in an element
    with class 'zg_title' is printed as "<rank>. <name>" followed by its url
    on an indented line. Ranks are assigned in page order, starting at 1.
    """
    html_parser = HTMLParser()

    # Scrape list of category names and urls
    landing_page = _fetch_soup("http://www.amazon.com/gp/bestsellers/")
    categories = _scrape_categories(landing_page, html_parser)

    # Loop through categories and print out each product's name, rank, and url.
    for category in categories:
        print(category['name'])
        print('-' * 50)

        category_page = _fetch_soup(category['url'])
        product_links = [
            title_div.a
            for title_div in category_page.findAll(attrs={'class': 'zg_title'})
            if title_div.a is not None
        ]

        # Every product is printed; the previous "if i == 1" guard showed
        # only the first product of each category.
        for rank, link in enumerate(product_links, 1):
            print("%d. %s\n    %s" % (rank,
                                      html_parser.unescape(link.string),
                                      link['href'].strip()))

        print('')

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


Source: http://www.elliotbradbury.com/scraping-best-selling-products-amazon-python/

No comments:

Post a Comment