add initial scraping and message concat
							parent
							
								
									180a5d2d12
								
							
						
					
					
						commit
						bd8af62765
					
				| @ -0,0 +1,7 @@ | ||||
| beautifulsoup4==4.9.0 | ||||
| certifi==2020.4.5.1 | ||||
| chardet==3.0.4 | ||||
| idna==2.9 | ||||
| requests==2.23.0 | ||||
| soupsieve==2.0 | ||||
| urllib3==1.25.9 | ||||
| @ -0,0 +1,66 @@ | ||||
| import requests | ||||
| import re | ||||
| from bs4 import BeautifulSoup | ||||
| 
 | ||||
def _background_image_url(style):
    """Return the image URL embedded in a CSS ``url(...)`` value.

    Args:
        style: The text of an element's inline ``style`` attribute.

    Returns:
        ``'https:'`` followed by the first ``url(...)`` argument found in
        *style*, with any surrounding single or double quotes removed.

    Raises:
        IndexError: If *style* contains no ``url(...)`` value.
    """
    # Raw string: ``\(`` is not a valid escape in a normal string literal.
    # The quotes are optional so both url("//x") and url(//x) yield ``//x``.
    fragment = re.findall(r'url\(["\']?(.*?)["\']?\)', style)[0]
    # NOTE(review): the scheme is prepended unconditionally, which presumes
    # the page uses protocol-relative URLs (``//host/path``) -- confirm.
    return 'https:' + fragment


def get_blog(timeout=10):
    """Scrape the Blizzard news landing page and return its posts.

    Featured articles are listed first (with an empty description),
    followed by the regular article list.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the caller forever.

    Returns:
        A list of dicts, one per post, with the keys ``'image'``,
        ``'game'``, ``'title'``, ``'description'`` and ``'url'``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError, IndexError, KeyError: If the page markup no longer
            contains the expected elements or attributes.
    """
    url = 'https://news.blizzard.com/en-us/'
    base_url = 'https://news.blizzard.com'

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    blog = []

    for feature_html in soup.find_all(class_='FeaturedArticle-link'):
        image_html = feature_html.find(class_='Card-image')
        text_list = feature_html.find_all(class_='text-truncate-ellipsis')

        blog.append({
            'image': _background_image_url(image_html.attrs['style']),
            'game': text_list[0].contents[0],
            'title': text_list[1].contents[0],
            # No description is read from the featured cards.
            'description': '',
            'url': base_url + feature_html.attrs['href'],
        })

    for article_html in soup.find_all(class_='ArticleListItem'):
        image_html = article_html.find(class_='ArticleListItem-image')
        content_html = article_html.find(class_='ArticleListItem-contentGrid')

        subtitle_html = content_html.find(class_='ArticleListItem-subtitle')
        description_html = content_html.find(class_='ArticleListItem-description')

        blog.append({
            'image': _background_image_url(image_html.attrs['style']),
            'game': subtitle_html.find(class_='ArticleListItem-labelInner').contents[0],
            'title': content_html.find(class_='ArticleListItem-title').contents[0],
            'description': description_html.find(class_='h6').contents[0],
            'url': base_url + article_html.find(class_='ArticleLink').attrs['href'],
        })

    return blog
def get_body(post):
    """Return the plain-text form of *post*.

    The result is the post's title, description and URL, in that order,
    separated by newline characters.
    """
    fields = (post['title'], post['description'], post['url'])
    return "\n".join(fields)
def get_formatted_body(post):
    """Return the HTML form of *post*.

    The title is rendered as an ``<h5>`` wrapped in a link to the post's
    URL, followed by the description in a ``<p>``.

    The values are HTML-escaped before being interpolated, so characters
    such as ``&``, ``<`` and ``"`` in the title, description or URL cannot
    break the markup or inject elements/attributes.

    Args:
        post: Mapping with the string values ``'title'``, ``'description'``
            and ``'url'``.

    Returns:
        The HTML fragment as a string.
    """
    # Local import keeps this block self-contained.
    from html import escape

    # escape() quotes " and ' by default, which makes the URL safe inside
    # the double-quoted href attribute.
    url = escape(post['url'])
    title = escape(post['title'])
    description = escape(post['description'])

    return (
        '<a href="' + url + '">' +
        '<h5>' + title + '</h5>' +
        '</a>' +
        '<p>' + description + '</p>'
    )
| 
 | ||||
# Script driver: fetch every post, then print its plain-text form followed
# by its HTML form.
blog = get_blog()
for post in blog:
    for render in (get_body, get_formatted_body):
        print(render(post))
					Loading…
					
					
				
		Reference in New Issue