# -*- coding: utf-8 -*-
from gensim.models import word2vec
from gensim import models
import pandas as pd
import numpy as np
import datetime
import codecs
import re
import matplotlib.pyplot as plt
%matplotlib notebook
import matplotlib.font_manager as fm
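# CJK-capable system font so the Chinese plot labels render correctly (this path is macOS-specific)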
myfont = fm.FontProperties(fname='/System/Library/Fonts/STHeiti Light.ttc')
import jieba
jieba.load_userdict("../data/Trump_dict.txt")  # custom jieba user dictionary of Trump-related terms
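# Load the scraped PTT articles and their push comments, parse the article dates,
# and attach each push to its article's date via the shared 'link' column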
article = pd.read_csv('../data/ptt_article.csv',encoding='utf-8')
push = pd.read_csv('../data/ptt_push.csv',encoding='utf-8')
article.loc[:,'dt'] = article.dt.map(lambda x:datetime.datetime.strptime(x,"%Y-%m-%d"))
push = pd.merge(push,article[['link','dt']],on='link',how='left')
# Build the weekly time windows for Word2Vec
start_dt = []
end_dt = []
c = range(27)
# 27 weekly windows: window i starts on Sunday 2016-06-26 + 7*i days
for i in c:
    lb = datetime.datetime(2016, 6, 26) + datetime.timedelta(days=7 * i)
    start_dt.append(lb)
# and ends on the following Saturday, 2016-07-02 + 7*i days
for i in c:
    lb = datetime.datetime(2016, 7, 2) + datetime.timedelta(days=7 * i)
    end_dt.append(lb)
print start_dt[-1:]
print end_dt[-1:]
# For each window, find the 20 terms most similar to 川普 (Trump)
keyword = pd.DataFrame(columns=['dt'] + ['k%d' % j for j in range(1, 21)])
for i in c:
    # collect article bodies and push comments that fall inside window i
    A1 = article[(article.dt > start_dt[i]) & (article.dt < end_dt[i])]['content']
    A2 = push[(push.dt > start_dt[i]) & (push.dt < end_dt[i])]['content']
    AA = A1.append(A2)
    AA = AA.fillna('')
    # segment every text with jieba and write one space-delimited line per document
    with codecs.open('Cont.txt', 'w', 'utf-8') as w:
        for x in AA:
            Fin = []
            L = re.split(u'[ ,，。()（）「」;；?？\n]', x)
            for line in L:
                if line:  # skip the empty strings produced by re.split
                    CList = list(jieba.cut(line))
                    Fin.extend(CList)
            w.write(' '.join(Fin) + '\n')
    # train a Word2Vec model on this window's corpus
    sentences = word2vec.LineSentence('Cont.txt')
    model = word2vec.Word2Vec(sentences)
    print start_dt[i].strftime("%Y-%m-%d") + " " + str(i)
    try:
        # each element of k_list is a (word, cosine similarity) tuple
        k_list = list(model.most_similar(u"川普", topn=20))
        line = pd.DataFrame({'dt': start_dt[i],
                             'k1': k_list[0], 'k2': k_list[1], 'k3': k_list[2], 'k4': k_list[3], 'k5': k_list[4],
                             'k6': k_list[5], 'k7': k_list[6], 'k8': k_list[7], 'k9': k_list[8], 'k10': k_list[9],
                             'k11': k_list[10], 'k12': k_list[11], 'k13': k_list[12], 'k14': k_list[13], 'k15': k_list[14],
                             'k16': k_list[15], 'k17': k_list[16], 'k18': k_list[17], 'k19': k_list[18], 'k20': k_list[19]})
        print "Finished"
        keyword = keyword.append(line)
    except KeyError:  # 川普 did not make it into this window's vocabulary
        print "No 川普 keyword"
keyword.to_csv('../outcome/Trump_keyword.csv',encoding='utf-8',index=False)
keyword.head()
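# The corpus-building and training steps above are repeated verbatim in the loop
# below. A minimal refactoring sketch of a shared helper; the name
# train_window_model is hypothetical and the helper is not called by the original
# loops, which inline these steps instead.
def train_window_model(i):
    """Segment all texts in window i, write them to Cont.txt, and train Word2Vec."""
    A1 = article[(article.dt > start_dt[i]) & (article.dt < end_dt[i])]['content']
    A2 = push[(push.dt > start_dt[i]) & (push.dt < end_dt[i])]['content']
    AA = A1.append(A2).fillna('')
    with codecs.open('Cont.txt', 'w', 'utf-8') as w:
        for x in AA:
            tokens = []
            for seg in re.split(u'[ ,，。()（）「」;；?？\n]', x):
                if seg:
                    tokens.extend(jieba.cut(seg))
            w.write(' '.join(tokens) + '\n')
    return word2vec.Word2Vec(word2vec.LineSentence('Cont.txt'))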
# For each window, measure how similar 升息 (rate hike) is to 川普 (Trump)
Interest = pd.DataFrame(columns=['dt','similarity'])
for i in c:
    A1 = article[(article.dt > start_dt[i]) & (article.dt < end_dt[i])]['content']
    A2 = push[(push.dt > start_dt[i]) & (push.dt < end_dt[i])]['content']
    AA = A1.append(A2)
    AA = AA.fillna('')
    # rebuild the segmented corpus and retrain Word2Vec for this window
    with codecs.open('Cont.txt', 'w', 'utf-8') as w:
        for x in AA:
            Fin = []
            L = re.split(u'[ ,，。()（）「」;；?？\n]', x)
            for line in L:
                if line:
                    CList = list(jieba.cut(line))
                    Fin.extend(CList)
            w.write(' '.join(Fin) + '\n')
    sentences = word2vec.LineSentence('Cont.txt')
    model = word2vec.Word2Vec(sentences)
    print start_dt[i].strftime("%Y-%m-%d") + " " + str(i)
    try:
        # cosine similarity between the two word vectors in this window's model
        s = model.similarity(u'川普', u'升息')
        line = pd.DataFrame([{'dt': start_dt[i], 'similarity': s}])
        Interest = Interest.append(line)
    except KeyError:  # one of the two words is missing from this window's vocabulary
        print 'No keyword'
        line = pd.DataFrame([{'dt': start_dt[i], 'similarity': 0}])
        Interest = Interest.append(line)
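# The original script does not persist the similarity table; a hedged sketch of
# saving it alongside the keyword output (the filename Trump_interest.csv is an
# assumption, not from the original):
Interest.to_csv('../outcome/Trump_interest.csv', encoding='utf-8', index=False)

# Plot the weekly similarity between 川普 and 升息 from 2016-07-17 onward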
p0 = Interest[Interest.dt >= datetime.datetime(2016,7,17)]
plt.figure()
plt.plot(p0.dt,p0.similarity,c='red')
plt.ylim([0.5,1.2])
plt.title(u"升息與川普相關度",fontproperties=myfont,fontsize=20)
plt.xlabel(u'日期',fontproperties=myfont)
plt.ylabel(u'相似度',fontproperties=myfont)
plt.show()