From d99399f6ac3fa6adc72c574c78062b5adca89f96 Mon Sep 17 00:00:00 2001 From: Joan Date: Tue, 11 Jul 2023 12:56:02 +0200 Subject: [PATCH] Added xpath method for obtaining image when normal fails --- bot/bot.py | 6 ++++-- bot/helpers.py | 2 +- bot/parser.py | 21 ++++++++++++++++++--- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/bot/bot.py b/bot/bot.py index 71015cd..46aa5d0 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -60,9 +60,10 @@ def find_amazon_link(update: Update, context: CallbackContext) -> None: driver.get(msg) logging.info("Scraping information and closing browser") soup = BeautifulSoup(driver.page_source, "lxml") + etree_soup = BeautifulSoup(driver.page_source, "html.parser") logging.info("Getting title...") - title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup) + title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup, etree_soup) if title == "": logging.info(f"Title not found, not a valid product or captcha") captcha = AmazonCaptcha.fromdriver(driver) @@ -80,8 +81,9 @@ def find_amazon_link(update: Update, context: CallbackContext) -> None: driver.get(msg) logging.info("Scraping information") soup = BeautifulSoup(driver.page_source, "lxml") + etree_soup = BeautifulSoup(driver.page_source, "html.parser") logging.info("Getting title...") - title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup) + title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup, etree_soup) if title == "": logging.info(f"Title not found, not a valid product or failed captcha") return diff --git a/bot/helpers.py b/bot/helpers.py index d58e98e..def2498 100644 --- a/bot/helpers.py +++ b/bot/helpers.py @@ -43,7 +43,7 @@ def create_image(product_id, price): try: product_image = Image.open(f"/app/data/images/products/{product_id}.jpg") except: - product_image = Image.open(f"/app/data/images/placeholder.jpg") + product_image = Image.open(f"/app/data/images/placeholder.png") hpercent = (baseheight / float(product_image.size[1])) wsize = int((float(product_image.size[0]) * float(hpercent))) if wsize < wlimit: diff --git a/bot/parser.py b/bot/parser.py index c5729b4..65e99cd 100644 --- a/bot/parser.py +++ b/bot/parser.py @@ -1,4 +1,11 @@ import helpers +import logging +from lxml import etree + +# Enable logging +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO +) def get_title(soup): try: @@ -22,11 +29,19 @@ def get_price(soup): return price -def get_image(soup): +def get_image(soup, etree_soup): try: image = soup.find("img", attrs={'id':'landingImage'}) image = image.get('src') + logging.info("Image found") except AttributeError: - image = "N/A" - + try: + logging.info("Couldn't retrieve image, trying with xpath method") + dom = etree.HTML(str(etree_soup)) + image = dom.xpath('//*[@id="main-image-container"]/ul/li[4]/span/span/div/img') + image = image[0].get('src') + except AttributeError: + logging.info("Couldn't retrieve image with xpath method, falling back to placeholder") + image = "N/A" + return image \ No newline at end of file