-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathpythonForNewsSina.py
More file actions
50 lines (50 loc) · 2.13 KB
/
pythonForNewsSina.py
File metadata and controls
50 lines (50 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import requests
import json
import pandas
from datetime import datetime
from bs4 import BeautifulSoup
# Listing-API URL template. The "{}" after "page=" is filled in with the page
# number via str.format. Because of "callback=newsloadercallback" the response
# is expected to be JSON wrapped in a "newsloadercallback(...)" call, which the
# listing parser strips before decoding.
url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1511263184507'
# Comment-API URL template. The "{}" after "newsid=comos-" is filled in with
# the article id. Because of "jsvar=loader_1511189047225_94708810" the response
# is expected to start with "var loader_1511189047225_94708810=", which the
# comment-count function strips before decoding.
commentUrl='http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&jsvar=loader_1511189047225_94708810'
def getCommentCount(url):
    """Return the total comment count for the news article at ``url``.

    The article id is taken from the last path segment of ``url``, which is
    expected to look like ``doc-i<id>.shtml``. The id is substituted into the
    module-level ``commentUrl`` template, the comment API is queried, and the
    ``result.count.total`` field of the decoded payload is returned.

    Raises:
        KeyError: if the decoded payload lacks ``result.count.total``.
        ValueError: if the response body is not valid JSON after the
            ``var ...=`` assignment prefix has been removed.
    """
    prefix = 'doc-i'
    suffix = '.shtml'
    jsvar = 'var loader_1511189047225_94708810='

    # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so
    # using them here would also eat leading/trailing id characters that
    # happen to be in that set. Remove the exact prefix and suffix instead.
    newsid = url.split('/')[-1]
    if newsid.startswith(prefix):
        newsid = newsid[len(prefix):]
    if newsid.endswith(suffix):
        newsid = newsid[:-len(suffix)]

    payload = requests.get(commentUrl.format(newsid)).text.lstrip()
    # The API answers with a JavaScript assignment; drop the assignment part
    # so that only the JSON object is left.
    if payload.startswith(jsvar):
        payload = payload[len(jsvar):]
    jd = json.loads(payload)
    return jd['result']['count']['total']
# Wraps the extraction of all information for a single news article.
def getNewsDetails(url):
    """Download the article page at ``url`` and return its details as a dict.

    The returned dict has these keys, in this order:
        title    -- text of the ``#artibodyTitle`` element
        dt       -- publication date formatted as ``YYYY-MM-DD``
        source   -- text of the link inside ``.time-source span``
        txt      -- text of every ``.article p`` paragraph except the last,
                    concatenated without a separator
        editor   -- text of ``.article-editor`` without its leading label
        comments -- value returned by ``getCommentCount(url)``

    Raises:
        IndexError: if one of the expected elements is missing from the page.
        ValueError: if the timestamp does not match ``%Y年%m月%d日%H:%M``.
    """
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    result = {}
    # Title
    result['title'] = soup.select('#artibodyTitle')[0].text
    # Date: strptime parses the page's string into a datetime, strftime
    # renders it back as an ISO-style date string.
    timesource = soup.select('.time-source')[0].contents[0].strip()
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M').strftime('%Y-%m-%d')
    # Source
    result['source'] = soup.select('.time-source span a')[0].text
    # Body text; the last <p> is skipped, as in the original selection.
    result['txt'] = ''.join(p.text.strip() for p in soup.select('.article p')[:-1])
    # Editor. str.lstrip takes a *set of characters*, so it would also remove
    # leading characters of the editor's name that occur in the label (for
    # example a name starting with 任). Remove the exact label instead.
    label = '责任编辑:'
    editor = soup.select('.article-editor')[0].text
    if editor.startswith(label):
        editor = editor[len(label):]
    result['editor'] = editor
    result['comments'] = getCommentCount(url)
    return result
# For one listing page, take every news link (item['url']) and fetch the
# details behind each link.
def parseListUrl(url):
    """Return the details of every article listed on the page at ``url``.

    The listing API answers with JSON wrapped in a ``newsloadercallback(...)``
    call. The wrapper is removed, the payload is decoded, and
    ``getNewsDetails`` is called for the ``url`` of each entry in
    ``result.data``.

    Returns:
        A list with one details dict per listed article.

    Raises:
        ValueError: if the unwrapped response body is not valid JSON.
        KeyError: if the payload lacks ``result.data`` or an item has no
            ``url``.
    """
    callback = 'newsloadercallback('
    res = requests.get(url)
    body = res.text.strip()
    # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, and
    # rstrip(');') stops working when the response ends in whitespace.
    # Unwrap the callback explicitly instead.
    if body.startswith(callback):
        body = body[len(callback):].rstrip(';').rstrip()
        if body.endswith(')'):
            body = body[:-1]
    jd = json.loads(body)
    return [getNewsDetails(item['url']) for item in jd['result']['data']]
# Crawl the listing pages and gather the details of every article found.
# NOTE(review): the original comment said "first three pages", but
# range(1, 2) yields only page 1 -- confirm which one is intended.
news_total = []
for page in range(1, 2):
    news_total.extend(parseListUrl(url.format(page)))
# One row per article, one column per key of the details dicts.
df = pandas.DataFrame(news_total)
# Preview of the first 15 rows; only displayed when run interactively
# (for example as the last expression of a notebook cell).
df.head(15)