# encoding: utf-8
# Notebook setup: dataframe/numeric libraries, inline plotting, and a
# CJK-capable font so the Chinese titles/labels below render correctly.
import numpy as np
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# macOS system font with Traditional Chinese glyphs; matplotlib's default
# font cannot display the Chinese text used in the plot titles/labels.
# NOTE(review): path is macOS-specific — confirm on other platforms.
myfont = fm.FontProperties(fname='/System/Library/Fonts/STHeiti Light.ttc')
import datetime
import jieba
import re
import codecs
# Load the scraped PTT Stock-board articles and their push (comment) records.
article = pd.read_csv('../data/ptt_article.csv',encoding='utf-8')
push = pd.read_csv('../data/ptt_push.csv',encoding='utf-8')
# Parse article dates into datetime objects for time-based filtering/grouping.
article.loc[:,'dt'] = article.dt.map(lambda x:datetime.datetime.strptime(x,"%Y-%m-%d"))
# Attach each push row to its parent article's date via the shared 'link' key.
push = pd.merge(push,article[['link','dt']],on='link',how='left')
article = article.fillna('')
# Some article contents are missing (NA); pandas would then treat the column
# as numeric and later string operations would fail, so missing values are
# replaced with empty strings first.
print article.shape
article.head(3)
print push.shape
push.head(3)
# Flag every article whose title or content mentions the XPEC (樂陞) case:
# company name, the buyer (百尺竿頭), or the stock ticker 3662.
pattern = u'樂陞|百尺竿頭|3662' # regular-expression pattern for str.contains
LS_title = article[article.title.str.contains(pattern)]["link"]
LS_content = article[article.content.str.contains(pattern)]["link"]
print("標題包含「樂陞」或「百尺竿頭」文章數: "+ str(len(set(LS_title))))
print("內文包含「樂陞」或「百尺竿頭」文章數: "+ str(len(set(LS_content))))
print("標題和內文都包含「樂陞」或「百尺竿頭」文章數: "+str(len(list(set(LS_title).intersection(set(LS_content))))))
article['LS_Ind'] = 0
# FIX: use .loc instead of the deprecated .ix indexer (removed in modern
# pandas); this also matches the .loc style used elsewhere in this script.
article.loc[article.link.isin(LS_title),'LS_Ind'] = 1
article.loc[article.link.isin(LS_content),'LS_Ind'] = 1
# Daily count of flagged articles, restricted to June 2016 onward.
plot = article.groupby('dt')['LS_Ind'].sum()
plot = plot.reset_index()
plot = plot[plot.dt >= datetime.datetime(2016,6,1)]
plt.figure(figsize=(10,5))
plt.plot(plot.dt,plot.LS_Ind,lw=2,c='#0000FF')
plt.title(u'PTT股票版提到樂陞時間分布',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'文章數量',fontproperties=myfont)
plt.show()
# Overlay the Brexit-related article counts on the XPEC timeline so the two
# event distributions can be compared.
pattern = u'脫歐|英國|歐盟'
Briexit_title = article[article.title.str.contains(pattern)]["link"]
Briexit_content = article[article.content.str.contains(pattern)]["link"]
# An article counts if either its title or its content matched the pattern.
brexit_hit = article.link.isin(Briexit_title) | article.link.isin(Briexit_content)
article.loc[:,'Briexit_Ind'] = 0
article.loc[brexit_hit,'Briexit_Ind'] = 1
# Daily totals, restricted to the same June-2016-onward window as the XPEC plot.
brexit_daily = article.groupby('dt')['Briexit_Ind'].sum().reset_index()
brexit_daily = brexit_daily[brexit_daily.dt >= datetime.datetime(2016,6,1)]
plt.figure(figsize=(10,5))
plt.plot(plot.dt,plot.LS_Ind,lw=2,c='#0000FF')
plt.plot(brexit_daily.dt,brexit_daily.Briexit_Ind,lw=2,c="#04B404")
plt.title(u'PTT股票版提到樂陞/脫歐文章時間分布',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'文章數量',fontproperties=myfont)
plt.legend([u'樂陞',u'脫歐'],prop=myfont)
plt.show()
# Tally the push (comment) reactions under XPEC-related articles.
push.loc[:,'LS_Ind'] = 0
push_hit = push.link.isin(LS_title) | push.link.isin(LS_content)
push.loc[push_hit,'LS_Ind'] = 1
# Daily reaction counts split by tag; the escaped tag values are
# u'→ ' (neutral arrow), u'噓 ' (boo/down), u'推 ' (push/up).
plot2 = push.groupby(['dt','tag'])['LS_Ind'].sum().reset_index()
plot2 = plot2[plot2.dt >= datetime.datetime(2016,6,1)]
plot2.head()
# Visualize how the three reaction types evolve over time.
neutral = plot2[plot2.tag == u'\u2192 ']
boo = plot2[plot2.tag == u'\u5653 ']
up = plot2[plot2.tag == u'\u63a8 ']
plt.figure(figsize=(10,5))
plt.plot(neutral.dt,neutral.LS_Ind,c='#848484')
plt.plot(boo.dt,boo.LS_Ind,c='#FF0000')
plt.plot(up.dt,up.LS_Ind,c='#04B404')
plt.title(u'PTT股票版提到樂陞文章時間分布',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'文章數量',fontproperties=myfont)
plt.legend([u'\u2192 ',u'\u5653 ',u'\u63a8 '],prop=myfont)
plt.show()
# Plot the volume of [公告] (moderator announcement) posts over time — a
# proxy for moderation activity such as user bans ("水桶").
# Brackets are regex metacharacters in str.contains, so they must be escaped.
# FIX: write the backslashes explicitly — '\[' is an invalid string escape
# that Python 2 only preserves by accident (and Python 3 warns/errors on).
pattern = u'\\[公告\\]'
Announce = article[article.title.str.contains(pattern)]
plot3 = Announce.groupby('dt')['link'].count()
plot3 = plot3.reset_index()
plot3 = plot3[plot3.dt >= datetime.datetime(2016,6,1)]
plt.figure(figsize=(10,5))
plt.plot(plot3.dt,plot3.link,lw=2)
plt.title(u'[公告]數量',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'數量',fontproperties=myfont)
plt.show()
# Restrict both tables to the XPEC-flagged rows, then count per-author,
# per-day activity (articles posted / pushes left).
LS_article = article[article.LS_Ind == 1]
LS_push = push[push.LS_Ind == 1]
# One row per (day, author); 'link' holds that author's count for the day.
article_author = LS_article.groupby(['dt','author'])['link'].count().reset_index()
push_author = LS_push.groupby(['dt','author'])['link'].count().reset_index()
def _top_authors(counts, start=None, end=None, top_n=10):
    """Rank authors by activity inside an (exclusive) date window.

    counts : DataFrame with 'dt', 'author', 'link' columns — one row per
             (day, author) pair as produced by the groupby above, so the
             result counts active days rather than total posts.
    start/end : optional datetime bounds; rows must satisfy start < dt < end
                (bounds are exclusive, matching the original comparisons).
    top_n : number of rows to keep after sorting descending.
    Returns a DataFrame indexed by author with a 'link' count column.
    """
    subset = counts
    if start is not None:
        subset = subset[subset.dt > start]
    if end is not None:
        subset = subset[subset.dt < end]
    ranked = subset[['author','link']].groupby('author').count()
    return ranked.sort_values(['link'], ascending=False).head(top_n)

# Event timeline: 2016-08-30 and 2016-09-15 split the data into
# before / during / after periods around the XPEC deal collapse.
_t0 = datetime.datetime(2016,6,1)
_t1 = datetime.datetime(2016,8,30)
_t2 = datetime.datetime(2016,9,15)

# Before the event (事前)
Article1 = _top_authors(article_author, _t0, _t1, 10)
Push1 = _top_authors(push_author, _t0, _t1, 20)
print(Article1)
print('')
print(Push1)

# During the event (事中)
Article2 = _top_authors(article_author, _t1, _t2, 10)
Push2 = _top_authors(push_author, _t1, _t2, 20)
print(Article2)
print('')
print(Push2)

# After the event (事後)
Article3 = _top_authors(article_author, _t2, None, 10)
# BUG FIX: the original used 2016-08-30 as the push cut-off here, which
# leaked the "during" period into the "after" push ranking; pushes now use
# the same 2016-09-15 boundary as the articles.
Push3 = _top_authors(push_author, _t2, None, 20)
print(Article3)
print('')
print(Push3)
#找出哪些人是值得相信的人
A1 = set(Article1.index)
A2 = set(Article2.index)
A3 = set(Article3.index)
#僅在事件發生當下常PO文的人
for i in A2-A1-A3:
print i
#僅事前常PO文的人
for i in A1-A2-A3:
print i
#僅事後常PO文的人
for i in A3-A1-A2:
print i