library(fpp2)
library(gridExtra)
library(grid)
library(quantmod)
library(RCurl)
library(dtwclust)
library(dtw)
library(stargazer)
library(dplyr)
library(sqldf)
# Get the S&P 500 companys' information
# Drop some companies due to data error
download = getURL("https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents.csv")
SP500 = read.csv(text=download)
head(SP500)
SP500 = SP500[-which(SP500$Symbol %in% c("BRK.B","MON","UA","WELL")),]
# Acquire stock price information with quantmod package
stock_price = list()
stock_name = NULL
for(ticker in sub_SP500$Symbol){
print(ticker)
stock = getSymbols(ticker,src='yahoo',from = '2015-07-01', auto.assign = F)[,4]
stock_price = c(stock_price,list(stock))
stock_name = c(stock_name,ticker)
Sys.sleep(2)
}
names(stock_price) = stock_name
# Save the final output
# save(stock_price, file="./data/stock_price.rds")
load('./data/stock_price.rds')
# Make sure the data quality
summary(sapply(stock_price,length))
less_data = names(which(sapply(stock_price,length) < 780))
SP500[SP500$Symbol %in% less_data,]
# Clean up data
stock_price = stock_price[-which(names(stock_price) %in% less_data)]
summary(sapply(stock_price,length))
summary(sapply(stock_price,function(x) sum(is.na(x))))
na_problem = sapply(stock_price,function(x) sum(is.na(x)))
na_problem[na_problem > 0]
stock_price = stock_price[-which(names(stock_price) == "BKNG")]
length(stock_price)
# normalized stock price
normalized = function(x){
m = mean(x)
s = sd(x)
n = (x-m)/s
return(n)
}
normalized_price = lapply(stock_price,function(x) normalized(x))
# cluster
Final = NULL
for(i in seq(length(normalized_price))){
d = data.frame(ind = seq(length(normalized_price[[i]])),stock_name = names(normalized_price)[i], price = normalized_price[[i]])
names(d) = c("ind","stock_name","price")
Final = rbind(Final,d)
}
Final_SP = NULL
for(i in seq(length(stock_price))){
d = data.frame(ind = seq(length(stock_price[[i]])),stock_name = names(stock_price)[i], price = stock_price[[i]])
names(d) = c("ind","stock_name","price")
Final_SP = rbind(Final_SP,d)
}
# save(Final,Final_SP,file="./data/Stock_Price_DataFrame.rds")
load('./data/Stock_Price_DataFrame.rds')
# standardized price time series
label_date = index(stock_price[[1]])
brek = c(1,200,400,600,780)
ggplot(data=Final,aes(x=ind,y=price))+geom_line(aes(colour=stock_name))+ theme(legend.position="none")+
geom_smooth(method='lm',formula=y~x,linetype="dashed")+scale_x_continuous(breaks=brek,labels=label_date[brek])+
xlab("date")+ylab("standardized stock price")
# time series clusters
dtw_cluster = tsclust(normalized_price, type="partitional",k=20,
distance="dtw_basic",centroid = "pam",seed=1234,trace=T,
args = tsclust_args(dist = list(window.size = 20L)))
plot(dtw_cluster)
# We decided to Use 10 clusters with dtw with window size = 5 in the end
dtw_cluster = tsclust(normalized_price, type="partitional",k=10,
distance="dtw_basic",centroid = "pam",seed=1234,trace=T,
args = tsclust_args(dist = list(window.size = 5)))
plot(dtw_cluster)
cluster_data = data.frame(stock_name=names(normalized_price),cluster=dtw_cluster@cluster)
cluster_data = sqldf('select t1.*, t2.Name, t2.Sector from cluster_data t1 left join SP500 t2 on t1.stock_name = t2.Symbol')
head(cluster_data)
# Analyze each cluster
Final = sqldf('select t1.*, t2.cluster from Final t1 left join cluster_data t2 on t1.stock_name = t2.stock_name')
Final_SP = sqldf('select t1.*, t2.cluster from Final_SP t1 left join cluster_data t2 on t1.stock_name = t2.stock_name')
ggplot(data=Final,aes(x=ind,y=price))+geom_line(aes(colour=stock_name))+facet_wrap(~cluster)+
theme(legend.position="none")+geom_smooth(method='auto',linetype="dashed")+
xlab("date")+ylab("normalized price")+ggtitle("10 Clusters with dtw distance")+
scale_x_continuous(breaks=brek,labels=label_date[brek])+theme(axis.text.x=element_text(angle = 90))
# Calculate returns
cluster_data$cluster = as.factor(cluster_data$cluster)
Final_ANA1 = split(Final,f=Final$cluster)
Final_SP_ANA1 = split(Final_SP,f=Final_SP$cluster)
Increase_Var = function(x){
t = merge(x[x$ind==780,c("stock_name","price")],x[x$ind==1,c("stock_name","price")],by="stock_name")
dif = t$price.x - t$price.y
return(dif)
}
dif_var = sapply(Final_ANA1,function(x) mean(Increase_Var(x)))
dif_var_real = sapply(Final_SP_ANA1,function(x) mean(Increase_Var(x)))
dif_var_return_sd = sapply(Final_ANA1,Increase_Var)
return_dataframe_sd = NULL
for(i in names(dif_var_return_sd)){
d = data.frame("cluster"=i,"return"=dif_var_return_sd[i])
names(d) = c("cluster","return")
return_dataframe_sd = rbind(return_dataframe_sd,d)
}
ggplot()+geom_bar(aes(x=as.character(index(dif_var)),y=dif_var,fill = ifelse(dif_var<0,"red","green")),stat='identity')+
theme(legend.position = "none")+xlab("Cluster")+ylab("Diff (sd)")
ggplot()+geom_bar(aes(x=as.character(index(dif_var_real)),y=dif_var_real,fill = ifelse(dif_var_real<0,"red","green")),stat='identity')+
theme(legend.position = "none")+xlab("Cluster")+ylab("Gross Spread ($)")
ggplot(data=return_dataframe_sd,aes(x = return))+geom_density()+facet_wrap(~cluster)+
xlab("standardized gross spread")+theme(axis.text.x=element_text(angle = 45))