From 7990d787fc2612f83da108fbaa3ac764b757560e Mon Sep 17 00:00:00 2001 From: lub Date: Sun, 12 Jul 2020 12:42:04 +0200 Subject: [PATCH] refactor category and image into own functions --- scrape.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/scrape.py b/scrape.py index a30d7f5..3e0c0c7 100644 --- a/scrape.py +++ b/scrape.py @@ -16,6 +16,13 @@ def get_accesstoken_from_file(accesstoken_path): accesstoken_file.close() return single_accesstoken +def extract_image_url(image_html): + # only recent articles wrap the url in "", so the pattern has to match it + # both with and without quotes + image_url_fragment = re.findall(r'url\("?(.*?)"?\)', image_html.attrs['style'])[0] + return 'https:'+image_url_fragment +def sanitize_category(raw_category): + return raw_category.replace(' ', '').replace(':', '').lower() def get_blog(): url = 'https://news.blizzard.com/en-us/' html = requests.get(url).text @@ -26,13 +33,11 @@ def get_blog(): blog = [] for featured_article in soup.select('#featured-articles article'): - image_html = featured_article.find(class_='Card-image') - image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0] - image_url = 'https:'+image_url_fragment + image_url = extract_image_url(featured_article.find(class_='Card-image')) text_list = featured_article.select('.text-truncate-ellipsis') - category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower() - title = text_list[1].contents[0] + category = sanitize_category(text_list[0].text) + title = text_list[1].text url = base_url+featured_article.find('a').attrs['href'] @@ -45,11 +50,9 @@ def get_blog(): }) for recent_article in soup.select('#recent-articles article'): - image_html = recent_article.find(class_='ArticleListItem-image') - image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0] - image_url = 'https:'+image_url_fragment + image_url = 
extract_image_url(recent_article.find(class_='ArticleListItem-image')) - category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower() + category = sanitize_category(recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text) title = recent_article.find(class_='ArticleListItem-title').text description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text