from bs4 import BeautifulSoup
import grequests
import pandas as pd
# STEP 1: Create List of URLs from main archive page
def get_urls(first_page: int = 1, last_page: int = 2) -> list:
    """Build the list of catalogue archive-page URLs to scrape.

    Args:
        first_page: Number of the first archive page (inclusive).
        last_page: Number of the last archive page (inclusive).

    Returns:
        The page URLs in ascending page order. The defaults yield pages
        1 and 2; the list is empty when ``last_page < first_page``.
    """
    urls = []
    for x in range(first_page, last_page + 1):
        urls.append(f'http://books.toscrape.com/catalogue/page-{x}.html')
        # Progress output: the page number plus the list built so far.
        print(f'Getting page url: {x}', urls)
    return urls
# STEP 2: Async Load HTML Content from page range in step 1
def get_data(urls):
    """Request every URL via grequests and return the mapped responses.

    One unsent grequests request is created per entry in ``urls``; the
    whole batch is then dispatched with ``grequests.map`` and its result
    is returned as-is. Both the request objects and the responses are
    printed for inspection.
    """
    pending = []
    for link in urls:
        pending.append(grequests.get(link))
    print('AsyncRequest object > reqs:', pending)

    responses = grequests.map(pending)
    print('Status Code > resp (info on page):', responses, '\n')
    return responses
# STEP 3: Extract title, price, product URL and thumbnail URL from the async responses in resp, which hold the HTML of all scraped pages.
def parse(resp):
    """Extract the listed products from every fetched archive page.

    Args:
        resp: Iterable of response objects as returned by ``get_data``.
            ``None`` entries (what ``grequests.map`` yields for a request
            that failed) are skipped rather than crashing on ``.text``.

    Returns:
        A list of dicts, one per product found, with the keys ``title``,
        ``price``, ``single_url`` and ``thumbnail``.
    """
    # Function-scope import so the block does not depend on a file-level
    # import being added elsewhere.
    from urllib.parse import urljoin

    # The scraped archive pages live under /catalogue/ (see get_urls), so
    # relative links found on them are resolved against that directory.
    catalogue_base = 'https://books.toscrape.com/catalogue/'

    productlist = []
    for r in resp:
        if r is None:
            print('Skipped: request failed, no response to parse')
            continue
        sp = BeautifulSoup(r.text, 'lxml')
        for item in sp.find_all('article', {'class': 'product_pod'}):
            href = item.find('a').attrs['href']
            src = item.find('img', {'class': 'thumbnail'}).attrs['src']
            product = {
                'title': item.find('h3').text.strip(),
                'price': item.find('p', {'class': 'price_color'}).text.strip(),
                'single_url': urljoin(catalogue_base, href),
                # urljoin collapses any '../' segments in a relative src,
                # which plain string concatenation would leave in the URL.
                'thumbnail': urljoin(catalogue_base, src),
            }
            productlist.append(product)
            print('Added: ', product)
    return productlist
# Pipeline: build the page URLs, fetch them, parse the products, export CSV.
urls = get_urls()        # Step 1: archive page URLs
resp = get_data(urls)    # Step 2: responses for those pages
records = parse(resp)    # Step 3: one dict per product
df = pd.DataFrame(records)
df.to_csv('books.csv', index=False)
The above script works as expected: it asynchronously scrapes the main archive page(s) of the website https://books.toscrape.com/ using grequests and BeautifulSoup.
Within the archive page it extracts the following book information:
- title
- price
- single product url
- thumbnail url
Issue
I need a way to extract further details, such as the UPC, from the single product pages and to associate those details back with the corresponding entries in the main list productlist.
Single Product Page Example: https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
source https://stackoverflow.com/questions/69395263/how-to-scrape-product-pages-using-python-grequests-and-beautifulsoup
Comments
Post a Comment