Added an XPath fallback for obtaining the product image when the normal lookup fails

This commit is contained in:
Joan
2023-07-11 12:56:02 +02:00
parent e3d7db2916
commit d99399f6ac
3 changed files with 23 additions and 6 deletions

View File

@@ -60,9 +60,10 @@ def find_amazon_link(update: Update, context: CallbackContext) -> None:
driver.get(msg) driver.get(msg)
logging.info("Scraping information and closing browser") logging.info("Scraping information and closing browser")
soup = BeautifulSoup(driver.page_source, "lxml") soup = BeautifulSoup(driver.page_source, "lxml")
etree_soup = BeautifulSoup(driver.page_source, "html.parser")
logging.info("Getting title...") logging.info("Getting title...")
title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup) title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup, etree_soup)
if title == "": if title == "":
logging.info(f"Title not found, not a valid product or captcha") logging.info(f"Title not found, not a valid product or captcha")
captcha = AmazonCaptcha.fromdriver(driver) captcha = AmazonCaptcha.fromdriver(driver)
@@ -80,8 +81,9 @@ def find_amazon_link(update: Update, context: CallbackContext) -> None:
driver.get(msg) driver.get(msg)
logging.info("Scraping information") logging.info("Scraping information")
soup = BeautifulSoup(driver.page_source, "lxml") soup = BeautifulSoup(driver.page_source, "lxml")
etree_soup = BeautifulSoup(driver.page_source, "html.parser")
logging.info("Getting title...") logging.info("Getting title...")
title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup) title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup, etree_soup)
if title == "": if title == "":
logging.info(f"Title not found, not a valid product or failed captcha") logging.info(f"Title not found, not a valid product or failed captcha")
return return

View File

@@ -43,7 +43,7 @@ def create_image(product_id, price):
try: try:
product_image = Image.open(f"/app/data/images/products/{product_id}.jpg") product_image = Image.open(f"/app/data/images/products/{product_id}.jpg")
except: except:
product_image = Image.open(f"/app/data/images/placeholder.jpg") product_image = Image.open(f"/app/data/images/placeholder.png")
hpercent = (baseheight / float(product_image.size[1])) hpercent = (baseheight / float(product_image.size[1]))
wsize = int((float(product_image.size[0]) * float(hpercent))) wsize = int((float(product_image.size[0]) * float(hpercent)))
if wsize < wlimit: if wsize < wlimit:

View File

@@ -1,4 +1,11 @@
import helpers import helpers
import logging
from lxml import etree
# Enable logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO
)
def get_title(soup): def get_title(soup):
try: try:
@@ -22,11 +29,19 @@ def get_price(soup):
return price return price
def get_image(soup, etree_soup):
    """Return the product image URL scraped from the page, or "N/A".

    The lookup is attempted in two stages:

    1. Find the ``<img id="landingImage">`` element in ``soup`` and read its
       ``src`` attribute.
    2. If that element is missing, serialize ``etree_soup`` to a string, parse
       it with ``lxml.etree.HTML`` and read ``src`` from the first node that
       matches a fixed XPath under ``#main-image-container``.

    Args:
        soup: Parsed page exposing ``find(name, attrs=...)``.
        etree_soup: Second parse of the same page; only ``str(etree_soup)``
            is used, and only when the first stage fails.

    Returns:
        The value of the image's ``src`` attribute, or the string ``"N/A"``
        when neither stage locates an image element.
    """
    try:
        image = soup.find("img", attrs={'id': 'landingImage'})
        # find() yields None when nothing matches, so .get raises
        # AttributeError and routes us to the XPath fallback.
        image = image.get('src')
        logging.info("Image found")
    except AttributeError:
        try:
            logging.info("Couldn't retrieve image, trying with xpath method")
            dom = etree.HTML(str(etree_soup))
            image = dom.xpath('//*[@id="main-image-container"]/ul/li[4]/span/span/div/img')
            image = image[0].get('src')
        except (AttributeError, IndexError):
            # xpath() returns a list; an empty result makes image[0] raise
            # IndexError (not AttributeError), which must also end in the
            # placeholder instead of propagating to the caller.
            logging.info("Couldn't retrieve image with xpath method, falling back to placeholder")
            image = "N/A"
    return image