add initial scraping and message concat
							parent
							
								
									180a5d2d12
								
							
						
					
					
						commit
						bd8af62765
					
				| @ -0,0 +1,7 @@ | |||||||
|  | beautifulsoup4==4.9.0 | ||||||
|  | certifi==2020.4.5.1 | ||||||
|  | chardet==3.0.4 | ||||||
|  | idna==2.9 | ||||||
|  | requests==2.23.0 | ||||||
|  | soupsieve==2.0 | ||||||
|  | urllib3==1.25.9 | ||||||
| @ -0,0 +1,66 @@ | |||||||
|  | import requests | ||||||
|  | import re | ||||||
|  | from bs4 import BeautifulSoup | ||||||
|  | 
 | ||||||
def get_blog(url='https://news.blizzard.com/en-us/', timeout=10):
    """Scrape the Blizzard news landing page into a list of post dicts.

    Featured articles (``FeaturedArticle-link`` elements) come first,
    followed by the regular article list (``ArticleListItem`` elements),
    each group in page order.

    Args:
        url: Page to fetch. Relative links and protocol-relative image URLs
            found on the page are resolved against it.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``image``, ``game``, ``title``,
        ``description`` and ``url``. Featured articles always carry an empty
        ``description``; ``image`` is ``''`` when no background image can be
        extracted from the element's inline style.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    # Matches CSS ``url(...)`` with or without quotes around the target.
    background_re = re.compile(r'''url\(\s*["']?(.*?)["']?\s*\)''')

    def background_image_url(image_html):
        """Return the absolute background-image URL of a tag, or ''."""
        if image_html is None:
            return ''
        match = background_re.search(image_html.attrs.get('style', ''))
        if match is None:
            return ''
        # urljoin turns protocol-relative ``//host/img`` into ``https://host/img``
        # and leaves already-absolute URLs untouched.
        return urljoin(url, match.group(1))

    # A timeout prevents hanging forever on a stalled connection; the status
    # check prevents parsing an error page as if it were the news listing.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    blog = []

    for feature_html in soup.find_all(class_='FeaturedArticle-link'):
        # The first two truncated-text nodes are the game label and the title.
        text_list = feature_html.find_all(class_='text-truncate-ellipsis')

        blog.append({
            'image': background_image_url(feature_html.find(class_='Card-image')),
            'game': text_list[0].contents[0],
            'title': text_list[1].contents[0],
            'description': '',
            'url': urljoin(url, feature_html.attrs['href']),
        })

    for article_html in soup.find_all(class_='ArticleListItem'):
        content_html = article_html.find(class_='ArticleListItem-contentGrid')
        subtitle_html = content_html.find(class_='ArticleListItem-subtitle')
        description_html = content_html.find(class_='ArticleListItem-description')
        link_html = article_html.find(class_='ArticleLink')

        blog.append({
            'image': background_image_url(
                article_html.find(class_='ArticleListItem-image')
            ),
            'game': subtitle_html.find(class_='ArticleListItem-labelInner').contents[0],
            'title': content_html.find(class_='ArticleListItem-title').contents[0],
            'description': description_html.find(class_='h6').contents[0],
            'url': urljoin(url, link_html.attrs['href']),
        })

    return blog
def get_body(post):
    """Return the plain-text message for a post.

    The result is the post's title, description and URL, one per line.
    """
    lines = (post['title'], post['description'], post['url'])
    return "\n".join(lines)
def get_formatted_body(post):
    """Return the HTML message for a post.

    The title is rendered as an ``<h5>`` wrapped in a link to the post URL,
    followed by the description in a ``<p>``.

    The post fields come from a scraped page, so they are HTML-escaped before
    being interpolated; otherwise characters such as ``&``, ``<`` or ``"`` in
    a title or URL would yield broken (or injected) markup.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from html import escape

    # quote=True also escapes quotes so the URL cannot break out of href="...".
    url = escape(str(post['url']), quote=True)
    title = escape(str(post['title']))
    description = escape(str(post['description']))

    return (
        '<a href="' + url + '">' +
        '<h5>' + title + '</h5>' +
        '</a>' +
        '<p>' + description + '</p>'
    )
|  | 
 | ||||||
# Demo run: fetch the current posts and print each one, first as plain text
# and then as HTML.
blog = get_blog()
for post in blog:
    for render in (get_body, get_formatted_body):
        print(render(post))
					Loading…
					
					
				
		Reference in New Issue