In [2]:
# -*- coding: utf-8 -*-
# Weekly Word2Vec analysis of PTT posts: which terms associate with "川普"
# (Trump) over time. Python 2 notebook (print statements, old gensim API).
from gensim.models import word2vec
from gensim import models  # NOTE(review): unused in the visible cells — may be needed below
import pandas as pd
import numpy as np
import datetime
import codecs
import re
import matplotlib.pyplot as plt
%matplotlib notebook
import matplotlib.font_manager as fm
# CJK-capable font for plot labels. NOTE(review): absolute macOS-only path —
# this cell breaks on other machines; consider a configurable font path.
myfont = fm.FontProperties(fname='/System/Library/Fonts/STHeiti Light.ttc')
In [3]:
import jieba
# Custom dictionary so domain terms (e.g. "川普") are kept as single tokens
# instead of being split by jieba's default segmentation.
jieba.load_userdict("../data/Trump_dict.txt")
Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/z8/g1n0vfrn7wdg2c3dz9q7zjm80000gn/T/jieba.cache
Loading model cost 2.334 seconds.
Prefix dict has been built succesfully.
In [5]:
# Load PTT articles and pushes (comments), then give each push its parent
# article's date so both frames can be sliced on the same weekly time axis.
article = pd.read_csv('../data/ptt_article.csv',encoding='utf-8')
push = pd.read_csv('../data/ptt_push.csv',encoding='utf-8')
# Vectorized date parse — equivalent to mapping datetime.strptime over every
# row, but idiomatic and much faster on large frames.
article.loc[:,'dt'] = pd.to_datetime(article.dt, format="%Y-%m-%d")
# Pushes carry no date of their own: inherit the article's `dt` through the
# shared `link` key; pushes whose article is missing end up with NaT.
push = pd.merge(push,article[['link','dt']],on='link',how='left')
In [6]:
# Build the weekly time axis for the Word2Vec runs: 27 consecutive
# Sunday-to-Saturday windows covering 2016-06-26 through 2016-12-31.
c = range(27)  # reused by the analysis loops below
week = datetime.timedelta(days=7)
start_dt = [datetime.datetime(2016,6,26) + week * i for i in c]
end_dt = [datetime.datetime(2016,7,2) + week * i for i in c]
# Sanity check: last window should end 2016-12-31.
# (Single-argument print(...) is valid in both Python 2 and Python 3.)
print(start_dt[-1:])
print(end_dt[-1:])
[datetime.datetime(2016, 12, 25, 0, 0)]
[datetime.datetime(2016, 12, 31, 0, 0)]
In [8]:
# For each weekly window: retrain a fresh Word2Vec model on that week's
# articles + pushes and record the 20 terms most similar to "川普" (Trump).
keyword = pd.DataFrame(columns=['dt'] + ['k%d' % j for j in range(1, 21)])
for i in c:
    # Slice both corpora to the current week and merge into one text Series.
    A1 = article[(article.dt > start_dt[i])&(article.dt < end_dt[i])]['content']
    A2 = push[(push.dt > start_dt[i])&(push.dt < end_dt[i])]['content']
    AA = A1.append(A2).fillna('')
    # Segment the week's text with jieba and write one space-separated
    # document per post, the line format word2vec.LineSentence expects.
    with codecs.open('Cont.txt','w','utf-8') as w:
        for x in AA:
            Fin = []
            # Split on ASCII and fullwidth CJK punctuation before segmenting.
            for piece in re.split(' |,|。|\(|\)|,|「|」|;|;|?|)|(|\n', x):
                # re.split yields '' between adjacent delimiters (never None);
                # skip empties instead of the ineffective `!= None` check.
                if piece:
                    Fin.extend(jieba.cut(piece))
            w.write(' '.join(Fin)+'\n')
    sentences = word2vec.LineSentence('Cont.txt')
    model = word2vec.Word2Vec(sentences)
    print(start_dt[i].strftime("%Y-%m-%d")+"   "+str(i))
    try:
        k_list = list(model.most_similar(u"川普", topn=20))
        # Each k_list entry is a (word, score) tuple, so the frame gains two
        # rows per week: row 0 holds the words, row 1 the similarity scores.
        row = {'k%d' % (j + 1): k_list[j] for j in range(20)}
        row['dt'] = start_dt[i]
        line = pd.DataFrame(row)
        print("Finished")
        keyword = keyword.append(line)
    except KeyError:
        # "川普" fell below Word2Vec's min_count this week (too few mentions);
        # catch only the OOV error instead of a bare except that hides bugs.
        print("No 川普 Key word")
2016-06-26   0
Finished
2016-07-03   1
No 川普 Key word
2016-07-10   2
No 川普 Key word
2016-07-17   3
Finished
2016-07-24   4
Finished
2016-07-31   5
Finished
2016-08-07   6
Finished
2016-08-14   7
Finished
2016-08-21   8
Finished
2016-08-28   9
Finished
2016-09-04   10
Finished
2016-09-11   11
Finished
2016-09-18   12
Finished
2016-09-25   13
Finished
2016-10-02   14
Finished
2016-10-09   15
Finished
2016-10-16   16
Finished
2016-10-23   17
Finished
2016-10-30   18
Finished
2016-11-06   19
Finished
2016-11-13   20
Finished
2016-11-20   21
Finished
2016-11-27   22
Finished
2016-12-04   23
Finished
2016-12-11   24
Finished
2016-12-18   25
Finished
2016-12-25   26
Finished
In [10]:
# Persist the weekly keyword table (word + score rows per week) and preview it.
keyword.to_csv('../outcome/Trump_keyword.csv',encoding='utf-8',index=False)
keyword.head()
Out[10]:
dt k1 k10 k11 k12 k13 k14 k15 k16 k17 ... k19 k2 k20 k3 k4 k5 k6 k7 k8 k9
0 2016-06-26 做出 承受 誇張 都給 遊戲 不確 除息日 受到 ... 員工 超級 這篇 短線 操盤 情緒 這點 不斷 合理
1 2016-06-26 0.994538 0.993149 0.993057 0.99299 0.992776 0.992747 0.992598 0.992469 0.992385 ... 0.99231 0.994336 0.992281 0.994196 0.994185 0.993664 0.993465 0.993409 0.993379 0.993154
0 2016-07-17 大概 下來 電信 上市 提到 確定 虧損 9500 ... 估計 費用 交割 520 一年 單日 拉到 股利 上奇 投資人
1 2016-07-17 0.995477 0.993776 0.993738 0.993733 0.993466 0.993438 0.993374 0.99323 0.993146 ... 0.993067 0.99456 0.993023 0.994559 0.994223 0.994149 0.994133 0.994071 0.994025 0.993938
0 2016-07-24 希望 不過 但是 問題 沒用 應用 每次 這麼 ... 的話 其實 他們 幾年 空間 一定

5 rows × 21 columns

In [12]:
# For each weekly window: retrain Word2Vec and record the cosine similarity
# between "川普" (Trump) and "升息" (rate hike).
# NOTE(review): the corpus-building code is duplicated from the keyword cell
# above; consider extracting a shared helper.
Interest = pd.DataFrame(columns=['dt','similarity'])
for i in c:
    # Slice both corpora to the current week and merge into one text Series.
    A1 = article[(article.dt > start_dt[i])&(article.dt < end_dt[i])]['content']
    A2 = push[(push.dt > start_dt[i])&(push.dt < end_dt[i])]['content']
    AA = A1.append(A2).fillna('')
    # Segment with jieba, one space-separated document per post.
    with codecs.open('Cont.txt','w','utf-8') as w:
        for x in AA:
            Fin = []
            for piece in re.split(' |,|。|\(|\)|,|「|」|;|;|?|)|(|\n', x):
                # re.split yields '' (never None) between adjacent delimiters.
                if piece:
                    Fin.extend(jieba.cut(piece))
            w.write(' '.join(Fin)+'\n')
    sentences = word2vec.LineSentence('Cont.txt')
    model = word2vec.Word2Vec(sentences)
    print(start_dt[i].strftime("%Y-%m-%d")+"   "+str(i))
    try:
        s = model.similarity(u'川普', u'升息')
    except KeyError:
        # Either word missing from this week's vocabulary (below min_count);
        # record 0 so the time series stays complete. Catch only KeyError —
        # a bare except would also hide real bugs.
        print('No key word')
        s = 0
    Interest = Interest.append(pd.DataFrame([{'dt':start_dt[i],'similarity':s}]))
2016-06-26   0
2016-07-03   1
No key word
2016-07-10   2
No key word
2016-07-17   3
2016-07-24   4
2016-07-31   5
2016-08-07   6
2016-08-14   7
2016-08-21   8
2016-08-28   9
2016-09-04   10
2016-09-11   11
2016-09-18   12
2016-09-25   13
2016-10-02   14
2016-10-09   15
2016-10-16   16
2016-10-23   17
2016-10-30   18
2016-11-06   19
2016-11-13   20
2016-11-20   21
2016-11-27   22
2016-12-04   23
2016-12-11   24
2016-12-18   25
2016-12-25   26
In [13]:
# Plot the weekly Trump-vs-rate-hike similarity, dropping the first weeks
# (before 2016-07-17) where the vocabulary was missing and similarity is 0.
p0 = Interest[Interest.dt >= datetime.datetime(2016, 7, 17)]
fig, ax = plt.subplots()
ax.plot(p0.dt, p0.similarity, c='red')
ax.set_ylim([0.5, 1.2])
# CJK labels need the explicit font configured at the top of the notebook.
ax.set_title(u"升息與川普相關度", fontproperties=myfont, fontsize=20)
ax.set_xlabel(u'日期', fontproperties=myfont)
ax.set_ylabel(u'相似度', fontproperties=myfont)
plt.show()