Added an XPath fallback for obtaining the product image when the normal lookup fails
This commit is contained in:
@@ -60,9 +60,10 @@ def find_amazon_link(update: Update, context: CallbackContext) -> None:
|
|||||||
driver.get(msg)
|
driver.get(msg)
|
||||||
logging.info("Scraping information and closing browser")
|
logging.info("Scraping information and closing browser")
|
||||||
soup = BeautifulSoup(driver.page_source, "lxml")
|
soup = BeautifulSoup(driver.page_source, "lxml")
|
||||||
|
etree_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||||
|
|
||||||
logging.info("Getting title...")
|
logging.info("Getting title...")
|
||||||
title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup)
|
title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup, etree_soup)
|
||||||
if title == "":
|
if title == "":
|
||||||
logging.info(f"Title not found, not a valid product or captcha")
|
logging.info(f"Title not found, not a valid product or captcha")
|
||||||
captcha = AmazonCaptcha.fromdriver(driver)
|
captcha = AmazonCaptcha.fromdriver(driver)
|
||||||
@@ -80,8 +81,9 @@ def find_amazon_link(update: Update, context: CallbackContext) -> None:
|
|||||||
driver.get(msg)
|
driver.get(msg)
|
||||||
logging.info("Scraping information")
|
logging.info("Scraping information")
|
||||||
soup = BeautifulSoup(driver.page_source, "lxml")
|
soup = BeautifulSoup(driver.page_source, "lxml")
|
||||||
|
etree_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||||
logging.info("Getting title...")
|
logging.info("Getting title...")
|
||||||
title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup)
|
title, price, image = parser.get_title(soup), parser.get_price(soup), parser.get_image(soup, etree_soup)
|
||||||
if title == "":
|
if title == "":
|
||||||
logging.info(f"Title not found, not a valid product or failed captcha")
|
logging.info(f"Title not found, not a valid product or failed captcha")
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ def create_image(product_id, price):
|
|||||||
try:
|
try:
|
||||||
product_image = Image.open(f"/app/data/images/products/{product_id}.jpg")
|
product_image = Image.open(f"/app/data/images/products/{product_id}.jpg")
|
||||||
except:
|
except:
|
||||||
product_image = Image.open(f"/app/data/images/placeholder.jpg")
|
product_image = Image.open(f"/app/data/images/placeholder.png")
|
||||||
hpercent = (baseheight / float(product_image.size[1]))
|
hpercent = (baseheight / float(product_image.size[1]))
|
||||||
wsize = int((float(product_image.size[0]) * float(hpercent)))
|
wsize = int((float(product_image.size[0]) * float(hpercent)))
|
||||||
if wsize < wlimit:
|
if wsize < wlimit:
|
||||||
|
|||||||
@@ -1,4 +1,11 @@
|
|||||||
import helpers
|
import helpers
|
||||||
|
import logging
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# Enable logging
|
||||||
|
logging.basicConfig(
|
||||||
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO
|
||||||
|
)
|
||||||
|
|
||||||
def get_title(soup):
|
def get_title(soup):
|
||||||
try:
|
try:
|
||||||
@@ -22,11 +29,19 @@ def get_price(soup):
|
|||||||
|
|
||||||
return price
|
return price
|
||||||
|
|
||||||
def get_image(soup, etree_soup=None):
    """Return the product image URL scraped from a product page.

    The primary lookup reads the ``src`` attribute of the
    ``<img id="landingImage">`` element found through *soup*.  When that
    lookup fails, a fixed XPath below ``#main-image-container`` is evaluated
    against the HTML of *etree_soup* instead.

    Args:
        soup: Parsed page exposing ``find(name, attrs=...)``.
        etree_soup: Parsed page whose ``str()`` form is handed to
            ``etree.HTML`` for the XPath fallback.  Defaults to *soup*, so
            the older one-argument call form keeps working.

    Returns:
        The image's ``src`` attribute value, or the string ``"N/A"`` when
        neither lookup finds an image element.
    """
    try:
        image = soup.find("img", attrs={'id': 'landingImage'}).get('src')
    except AttributeError:
        # soup.find() returned None: there is no landingImage element.
        logging.info("Couldn't retrieve image, trying with xpath method")
    else:
        logging.info("Image found")
        return image

    source = soup if etree_soup is None else etree_soup
    try:
        dom = etree.HTML(str(source))
        matches = dom.xpath('//*[@id="main-image-container"]/ul/li[4]/span/span/div/img')
        # xpath() yields an empty list when nothing matches, so indexing it
        # raises IndexError rather than AttributeError; both mean "no image".
        return matches[0].get('src')
    except (AttributeError, IndexError):
        logging.info("Couldn't retrieve image with xpath method, falling back to placeholder")
        return "N/A"
|
||||||
Reference in New Issue
Block a user