add initial scraping and message concat

6 years ago · bd8af62765
parent 180a5d2d12
commit bd8af62765
2 changed files with 73 additions and 0 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,7 @@
+beautifulsoup4==4.9.0
+certifi==2020.4.5.1
+chardet==3.0.4
+idna==2.9
+requests==2.23.0
+soupsieve==2.0
+urllib3==1.25.9
--- a/scrape.py
+++ b/scrape.py
@ -0,0 +1,66 @@
+import requests
+import re
+from bs4 import BeautifulSoup
+
+def get_blog():
+    """Scrape the Blizzard news front page and return its posts.
+
+    Returns:
+        A list of dicts, featured articles first and then the regular
+        article list, each with the keys 'image', 'game', 'title',
+        'description' and 'url'. Featured articles always get an empty
+        'description'.
+
+    Raises:
+        requests.exceptions.RequestException: if the page cannot be fetched,
+            including when the server does not answer within the timeout.
+        AttributeError, IndexError, KeyError: if the page markup no longer
+            has the class names, inline styles or attributes read below.
+    """
+    url = 'https://news.blizzard.com/en-us/'
+    # Without a timeout, requests waits indefinitely on an unresponsive server.
+    html = requests.get(url, timeout=30).text
+    soup = BeautifulSoup(html, 'html.parser')
+
+    base_url = 'https://news.blizzard.com'
+
+    # Raw strings keep `\(` a regex escape rather than an invalid string
+    # escape; the patterns are compiled once instead of inside the loops.
+    featured_image_re = re.compile(r'url\("(.*?)"\)')
+    # NOTE(review): unlike the featured pattern, this one does not expect
+    # quotes around the URL -- presumably that matches the markup; confirm.
+    article_image_re = re.compile(r'url\((.*?)\)')
+
+    blog = []
+
+    feature_list_html = soup.find_all(class_='FeaturedArticle-link')
+    for feature_html in feature_list_html:
+        image_html = feature_html.find(class_='Card-image')
+        # The image location is read out of the inline style's url("...").
+        image_url_fragment = featured_image_re.findall(image_html.attrs['style'])[0]
+        # Presumably the fragment is protocol-relative ("//host/path").
+        image_url = 'https:'+image_url_fragment
+
+        # The first two matching elements are used as game and title.
+        text_list = feature_html.find_all(class_='text-truncate-ellipsis')
+
+        blog.append({
+            'image': image_url,
+            'game': text_list[0].contents[0],
+            'title': text_list[1].contents[0],
+            'description': '',
+            'url': base_url+feature_html.attrs['href'],
+        })
+
+    article_list_html = soup.find_all(class_='ArticleListItem')
+    for article_html in article_list_html:
+        image_html = article_html.find(class_='ArticleListItem-image')
+        image_url_fragment = article_image_re.findall(image_html.attrs['style'])[0]
+        image_url = 'https:'+image_url_fragment
+
+        content_html = article_html.find(class_='ArticleListItem-contentGrid')
+
+        blog.append({
+            'image': image_url,
+            'game': content_html.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').contents[0],
+            'title': content_html.find(class_='ArticleListItem-title').contents[0],
+            'description': content_html.find(class_='ArticleListItem-description').find(class_='h6').contents[0],
+            'url': base_url+article_html.find(class_='ArticleLink').attrs['href'],
+        })
+
+    return blog
+def get_body(post):
+    """Return the plain-text form of a post.
+
+    The post's 'title', 'description' and 'url' values are placed on three
+    consecutive lines, in that order, with no trailing newline.
+    """
+    fields = (post['title'], post['description'], post['url'])
+    return "\n".join(fields)
+def get_formatted_body(post):
+    """Return an HTML rendering of a post: a linked title, then the description.
+
+    The 'url', 'title' and 'description' values are HTML-escaped before being
+    inserted, so characters such as '&', '<' or '"' in them cannot break the
+    markup or the href attribute.
+
+    Args:
+        post: mapping with string values under 'title', 'description' and
+            'url'.
+
+    Returns:
+        A string of the form
+        '<a href="URL"><h5>TITLE</h5></a><p>DESCRIPTION</p>'.
+    """
+    # Imported locally so this function does not rely on a module-level import.
+    from html import escape
+
+    return (
+        '<a href="'+escape(post['url'])+'">'+
+        '<h5>'+escape(post['title'])+'</h5>'+
+        '</a>'+
+        '<p>'+escape(post['description'])+'</p>'
+    )
+
+# Fetch the posts once, then print each one as plain text followed by HTML.
+blog = get_blog()
+for post in blog:
+    for render in (get_body, get_formatted_body):
+        print(render(post))