From fed4b15d0815ba89250095fc0b600f0564fc8d32 Mon Sep 17 00:00:00 2001 From: lub Date: Sun, 12 Jul 2020 12:30:03 +0200 Subject: [PATCH] fix scraping --- scrape.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/scrape.py b/scrape.py index 972438e..a30d7f5 100644 --- a/scrape.py +++ b/scrape.py @@ -25,36 +25,42 @@ def get_blog(): blog = [] - feature_list_html = soup.find_all(class_='FeaturedArticle-link') - for feature_html in feature_list_html: - image_html = feature_html.find(class_='Card-image') - image_url_fragment = re.findall('url\("(.*?)"\)', image_html.attrs['style'])[0] + for featured_article in soup.select('#featured-articles article'): + image_html = featured_article.find(class_='Card-image') + image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0] image_url = 'https:'+image_url_fragment - text_list = feature_html.find_all(class_='text-truncate-ellipsis') + text_list = featured_article.select('.text-truncate-ellipsis') + category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower() + title = text_list[1].contents[0] + + url = base_url+featured_article.find('a').attrs['href'] blog.append({ 'image': image_url, - 'category': text_list[0].contents[0].replace(' ', '').replace(':', '').lower(), - 'title': text_list[1].contents[0], - 'description': '', - 'url': base_url+feature_html.attrs['href'], + 'category': category, + 'title': title, + 'description': '', # featured articles don't have a description + 'url': url, }) - article_list_html = soup.find_all(class_='ArticleListItem') - for article_html in article_list_html: - image_html = article_html.find(class_='ArticleListItem-image') - image_url_fragment = re.findall('url\((.*?)\)', image_html.attrs['style'])[0] + for recent_article in soup.select('#recent-articles article'): + image_html = recent_article.find(class_='ArticleListItem-image') + image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0] image_url = 'https:'+image_url_fragment - content_html = article_html.find(class_='ArticleListItem-contentGrid') + category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower() + title = recent_article.find(class_='ArticleListItem-title').text + description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text + + url = base_url+recent_article.find('a').attrs['href'] blog.append({ 'image': image_url, - 'category': content_html.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').contents[0].replace(' ', '').replace(':', '').lower(), - 'title': content_html.find(class_='ArticleListItem-title').contents[0], - 'description': content_html.find(class_='ArticleListItem-description').find(class_='h6').contents[0], - 'url': base_url+article_html.find(class_='ArticleLink').attrs['href'], + 'category': category, + 'title': title, + 'description': description, + 'url': url }) # reverse order so the oldest article is at [0]