In [1]:
# encoding: utf-8
import numpy as np
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
myfont = fm.FontProperties(fname='/System/Library/Fonts/STHeiti Light.ttc')
import datetime
import jieba
import re
import codecs
In [6]:
article = pd.read_csv('../data/ptt_article.csv',encoding='utf-8')
push = pd.read_csv('../data/ptt_push.csv',encoding='utf-8')
article.loc[:,'dt'] = article.dt.map(lambda x:datetime.datetime.strptime(x,"%Y-%m-%d"))
push = pd.merge(push,article[['link','dt']],on='link',how='left')
article = article.fillna('') 
#由於有些文章content會是missing值(NA),pandas的格式就會變成數值格式將導致後續資料分析有誤,因此必須先將遺漏值格式改為字串格式
In [3]:
print article.shape
article.head(3)
(7447, 8)
Out[3]:
dt author title link board content positive negative
0 2016-05-24 murray [新聞] 接受日媒訪問 王雪紅:正在分拆VR事業 https://www.ptt.cc/bbs/Stock/M.1464104526.A.5C... Stock 1.原文連結(必須檢附):2.原文內容:宏達電(2498)董事長王雪紅今接受《日本經濟新聞》... 42 1
1 2016-05-25 abuu0929 [請益] 漲跌家數及加權股價報酬指數歷史資料 https://www.ptt.cc/bbs/Stock/M.1464163596.A.FC... Stock 我想要台股每日上漲家數、下跌家數及加權股價報酬指數的歷史資料,進行分析用當然我知道可以一天一... 2 0
2 2016-05-25 ailess2 Re: [心得] 廈門開銀行及證券戶經驗分享 https://www.ptt.cc/bbs/Stock/M.1464125274.A.0B... Stock (吃掉一些字,不好意思)分享一下自己開戶的經驗這個不一定必要,看時間安排中國所有的銀行跟證券... 4 0
In [4]:
print push.shape
push.head(3)
(650112, 6)
Out[4]:
link tag author time content dt
0 https://www.ptt.cc/bbs/Stock/M.1464104526.A.5C... alarm911 05/25 00:13\n :,為何不把大虧的手機切出去? 2016-05-24
1 https://www.ptt.cc/bbs/Stock/M.1464104526.A.5C... bcw1218 05/25 07:45\n :有庫藏股護身當然可以這樣講,跑去空剛好變燃料 2016-05-24
2 https://www.ptt.cc/bbs/Stock/M.1464104526.A.5C... Brightheat 05/24 23:57\n :看來明天要漲了反著看 2016-05-24
In [7]:
pattern = u'樂陞|百尺竿頭|3662' #可放入regular expression的pattern
LS_title = article[article.title.str.contains(pattern)]["link"]
LS_content = article[article.content.str.contains(pattern)]["link"]
In [8]:
print "標題包含「樂陞」或「百尺竿頭」文章數: "+ str(len(set(LS_title)))
print "內文包含「樂陞」或「百尺竿頭」文章數: "+ str(len(set(LS_content)))
print "標題和內文都包含「樂陞」或「百尺竿頭」文章數: "+str(len(list(set(LS_title).intersection(set(LS_content)))))
標題包含「樂陞」或「百尺竿頭」文章數: 494
內文包含「樂陞」或「百尺竿頭」文章數: 508
標題和內文都包含「樂陞」或「百尺竿頭」文章數: 354
In [9]:
article['LS_Ind'] = 0
article.ix[article.link.isin(LS_title),'LS_Ind'] = 1
article.ix[article.link.isin(LS_content),'LS_Ind'] = 1
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  
In [10]:
plot = article.groupby('dt')['LS_Ind'].sum()
plot = plot.reset_index()
plot = plot[plot.dt >= datetime.datetime(2016,6,1)]
plt.figure(figsize=(10,5))
plt.plot(plot.dt,plot.LS_Ind,lw=2,c='#0000FF')
plt.title(u'PTT股票版提到樂陞時間分布',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'文章數量',fontproperties=myfont)
plt.show()
In [11]:
#加入英國脫歐的時間分佈,兩者可進行比較
pattern = u'脫歐|英國|歐盟'
Briexit_title = article[article.title.str.contains(pattern)]["link"]
Briexit_content = article[article.content.str.contains(pattern)]["link"]
article.loc[:,'Briexit_Ind'] = 0
article.loc[article.link.isin(Briexit_title),'Briexit_Ind'] = 1
article.loc[article.link.isin(Briexit_content),'Briexit_Ind'] = 1
plot_2 = article.groupby('dt')['Briexit_Ind'].sum()
plot_2 = plot_2.reset_index()
plot_2 = plot_2[plot_2.dt >= datetime.datetime(2016,6,1)]
plt.figure(figsize=(10,5))
plt.plot(plot.dt,plot.LS_Ind,lw=2,c='#0000FF')
plt.plot(plot_2.dt,plot_2.Briexit_Ind,lw=2,c="#04B404")
plt.title(u'PTT股票版提到樂陞/脫歐文章時間分布',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'文章數量',fontproperties=myfont)
plt.legend([u'樂陞',u'脫歐'],prop=myfont)
plt.show()
In [12]:
#將提到樂陞的文章底下的推噓文數量進行計算
push.loc[:,'LS_Ind'] = 0
push.loc[push.link.isin(LS_title),'LS_Ind'] = 1
push.loc[push.link.isin(LS_content),'LS_Ind'] = 1
plot2 = push.groupby(['dt','tag'])['LS_Ind'].sum()
plot2 = plot2.reset_index()
plot2 = plot2[plot2.dt >= datetime.datetime(2016,6,1)]
plot2.head()
Out[12]:
dt tag LS_Ind
24 2016-06-01 50
25 2016-06-01 1
26 2016-06-01 70
27 2016-06-02 25
28 2016-06-02 7
In [13]:
#視覺化呈現樂生案推噓文的時間變化
p1 = plot2[plot2.tag == u'\u2192 ']
p2 = plot2[plot2.tag == u'\u5653 ']
p3 = plot2[plot2.tag == u'\u63a8 ']
plt.figure(figsize=(10,5))
plt.plot(p1.dt,p1.LS_Ind,c='#848484')
plt.plot(p2.dt,p2.LS_Ind,c='#FF0000')
plt.plot(p3.dt,p3.LS_Ind,c='#04B404')
plt.title(u'PTT股票版提到樂陞文章時間分布',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'文章數量',fontproperties=myfont)
plt.legend([u'\u2192 ',u'\u5653 ',u'\u63a8 '],prop=myfont)
plt.show()
In [15]:
#呈現被水桶的狀況
pattern = u'\[公告\]'
Announce = article[article.title.str.contains(pattern)]
plot3 = Announce.groupby('dt')['link'].count()
plot3 = plot3.reset_index()
plot3 = plot3[plot3.dt >= datetime.datetime(2016,6,1)]
plt.figure(figsize=(10,5))
plt.plot(plot3.dt,plot3.link,lw=2)
plt.title(u'[公告]數量',fontproperties=myfont,fontsize=20)
plt.xlabel(u'時間',fontproperties=myfont)
plt.ylabel(u'數量',fontproperties=myfont)
plt.show()
In [16]:
#篩選出樂陞案的文章
LS_article = article[article.LS_Ind == 1]
LS_push = push[push.LS_Ind == 1]
In [17]:
#計算作者的發文量
article_author = LS_article.groupby(['dt','author'])['link'].count()
article_author = article_author.reset_index()
push_author = LS_push.groupby(['dt','author'])['link'].count()
push_author = push_author.reset_index()
In [18]:
#事前
pre_article = article_author[(article_author.dt > datetime.datetime(2016,6,1))&(article_author.dt < datetime.datetime(2016,8,30))][['author','link']]
pre_push = push_author[(push_author.dt > datetime.datetime(2016,6,1))&(push_author.dt < datetime.datetime(2016,8,30))][['author','link']]
pre_article = pre_article.groupby('author').count()
pre_push = pre_push.groupby('author').count()
print pre_article.sort_values(['link'],ascending=False).head(10)
print ''
print pre_push.sort_values(['link'],ascending=False).head(20)
Article1 = pre_article.sort_values(['link'],ascending=False).head(10)
Push1 = pre_push.sort_values(['link'],ascending=False).head(20)
            link
author          
comjj45       12
qxxrbull       9
t288850        9
livethere      7
hijacker       5
sdes6332       4
Lv10           4
spencer222     3
douglasjs      3
ecstasy1       3

              link
author            
Sana            33
hijacker        30
johnps          30
f204137         27
genius721105    26
denispeter      19
teremy          19
panpan          18
aoog5858        17
t288850         17
noar            17
Yuwuen          17
yso0402         16
neobee          16
Kunoichi        16
herikocat       15
ftrain          15
S26             15
ecstasy1        14
kkizikk1636     14
In [19]:
#事中
pre_article = article_author[(article_author.dt > datetime.datetime(2016,8,30))&(article_author.dt < datetime.datetime(2016,9,15))][['author','link']]
pre_push = push_author[(push_author.dt > datetime.datetime(2016,8,30))&(push_author.dt < datetime.datetime(2016,9,15))][['author','link']]
pre_article = pre_article.groupby('author').count()
pre_push = pre_push.groupby('author').count()
print pre_article.sort_values(['link'],ascending=False).head(10)
print ''
print pre_push.sort_values(['link'],ascending=False).head(20)
Article2 = pre_article.sort_values(['link'],ascending=False).head(10)
Push2 = pre_push.sort_values(['link'],ascending=False).head(20)
              link
author            
nightwing       12
IanLi            6
catwalk456       6
tagso            5
qxxrbull         5
livethere        4
YuliGourriel     4
Sirius1812       4
RainCityBoy      4
fish19831012     4

              link
author            
nightwing       14
f204137         14
SweetLee        13
serra28         13
iuowsiq         13
herikocat       13
kurama1984      13
CTC0115         13
ecstasy1        13
gn00295120      12
clubbox         12
goldduck        12
ww              12
genius721105    12
smad            12
rjie            12
Sueo            11
largescale      11
outofthelove    11
hjgx            11
In [20]:
#事後
pre_article = article_author[(article_author.dt > datetime.datetime(2016,9,15))][['author','link']]
pre_push = push_author[(push_author.dt > datetime.datetime(2016,8,30))][['author','link']]
pre_article = pre_article.groupby('author').count()
pre_push = pre_push.groupby('author').count()
print pre_article.sort_values(['link'],ascending=False).head(10)
print ''
print pre_push.sort_values(['link'],ascending=False).head(20)
Article3 = pre_article.sort_values(['link'],ascending=False).head(10)
Push3 = pre_push.sort_values(['link'],ascending=False).head(20)
              link
author            
nightwing       26
qxxrbull         9
f204137          7
cpy5740          6
a125567365       6
fish19831012     6
cheinshin        5
Sirius1812       5
catwalk456       5
femlro           4

              link
author            
SweetLee        67
f204137         63
nightwing       60
Kunoichi        48
CTC0115         45
ecstasy1        45
transcend789    44
s860134         43
ftrain          41
ExtraCream      40
serra28         40
clubbox         40
Sueo            39
rjie            39
herikocat       38
tctv2002        38
bitlife         38
bear753951      38
ww              38
stocktonty      37
In [21]:
#找出哪些人是值得相信的人
A1 = set(Article1.index)
A2 = set(Article2.index)
A3 = set(Article3.index)
In [22]:
#僅在事件發生當下常PO文的人
for i in A2-A1-A3:
    print i
IanLi
RainCityBoy
tagso
YuliGourriel
In [23]:
#僅事前常PO文的人
for i in A1-A2-A3:
    print i
Lv10
hijacker
spencer222
sdes6332
douglasjs
comjj45
ecstasy1
t288850
In [24]:
#僅事後常PO文的人
for i in A3-A1-A2:
    print i
cheinshin
femlro
f204137
cpy5740
a125567365