databaseR/DataDistribution.R at master · ReasonFoundation/databaseR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
## Data Distribution ##
## Data: Quarterly GDP by State from Bureau of Economic Analysis
## by Anil Niraula

#==Release: https://bea.gov/newsreleases/regional/gdp_state/qgdpstate_newsrelease.htm
#==Tables only: https://bea.gov/newsreleases/regional/gdp_state/2018/xls/qgdpstate0518.xlsx
#Packages required: XLConnect, fitdistrplus

#### Key Takeaways
## Most data is not normally distributed (indicator# 1: difference between mean & median)
## Use boxplot() + "fitdistrplus" package to check Distribution & Outliers
## Consider removing Outliers & use log10 transformation to get clearer distribution
## If your data is skewed, try to increase sample size or "resample" for better fit

#Extracting downloaded and slightly pre-processed GDP by state data from BEA (2017, Q4)
rm(list=ls())
###Load/install packages
#R.Version()
#https://github.com/ReasonFoundation/pensionviewr
#Create token -> usethis::edit_r_environ() -> restart -> Sys.getenv("GITHUB_PAT")
#install.packages('devtools')
#library(devtools)
#devtools::install_github("ReasonFoundation/reasontheme",force = TRUE)
#devtools::install_github("ReasonFoundation/pensionviewr", force = TRUE)
library(reasontheme)
library(pensionviewr)
library(ggplot2)
library(tidyverse)
library(tseries)
library(data.table)
library(readr)
library(rsconnect)
library(plyr)
library(tseries)
library(openxlsx)
library(rsconnect)
library(base64enc)
#Shiny--
library(shiny)
library(shinyWidgets)
library(shinymanager)
library(repmis)
library(plotly)
#library(shinyFiles)
#Other---
library(XLConnect)
library(fitdistrplus)

#Sources:
#https://www.investopedia.com/articles/06/probabilitydistribution.asp

##Loading State GDP data from GitHub:
#https://github.com/ReasonFoundation/databaseR/tree/master/files/tips
urlfile <- "https://raw.githubusercontent.com/ReasonFoundation/databaseR/master/files/tips/qgdpstate0518_truncated.csv"
GDPSt <- data.table(
  read_csv(url(urlfile), col_names = TRUE, na = c(""), skip_empty_rows = TRUE, col_types = NULL)
)

#GDPSt = readWorksheetFromFile(url, header = FALSE,
#                              sheet = 4, startRow = 4, startCol = 1, endCol = 17, endRow = 65)
#View(GDPSt)

#Excluding regional totals
#("New England", "Midwest", "Great Lakes", "Plains",
#"Southeast", "Southwest", "Rocky Mountain", and "Far West")
StateGDP <-as.data.frame(GDPSt[-c(8, 15, 21, 29, 42, 47, 53, 60:61),c(1,9)]) #convert GDP to $Millions
StateGDP <- StateGDP %>%
  select(StateGDP[,1], StateGDP[,9]) %>%
  arrange(desc(StateGDP[,1]))

colnames(StateGDP) <- c("State", "GDP_State")
StateGDP$GDP_State <- as.numeric(StateGDP$GDP_State)/1000
View(StateGDP)

#Number of non-zero observations
n = sum(!is.na(StateGDP$GDP_State))
n

#### Summary Statistic ####
params <- StateGDP %>%
  summarize(mean = round(mean(as.matrix(StateGDP$GDP_State)),1),
            median = round(median(as.matrix(StateGDP$GDP_State)),1),
            sd = round(sd(as.matrix(StateGDP$GDP_State)),1))
View(params)

#as.data.frame(replicate(100,sample(StateGDP)))
#sample(StateGDP,1, replace=T)

summary(StateGDP$GDP_State)
#GDP by State
#Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#32.59   96.67  231.46  397.39  517.31 2802.29

#Considerable difference between mean and median
#can show that the data might not be normally distributed

#To plot the data we need to convert it back to "numeric" class
GSPSt <- as.matrix(StateGDP$GDP_State)
GSP <- as.numeric(GSPSt)
#View(GSP)

#### Normal Distribution ####

library(moments)

n.sample <- rnorm(n = 100000, mean = params$mean, sd = params$sd)
skewness(n.sample)
kurtosis(n.sample)

datasim <- data.frame(n.sample)
ggplot(datasim, aes(x = n.sample), binwidth = 1) +
  geom_histogram(aes(y = ..density..), fill = 'red', alpha = 0.5) +
  geom_density(colour = 'blue') + xlab(expression(bold('Simulated Samples'))) +
  ylab(expression(bold('Density')))

#### Skewed Distribution ####

#Is distribution skewed (skewness + mean/median)?

#Plot the data, CDF, and density
#(historgams work best for simple visualization of underlying distribution)
hist(GSP)

## Using log10 transformation to spread the distribution (for visualization)
hist(log10(GSP))
#Plot CDF (i.e. cumulative distribution function for continuous variables)
plot(ecdf(GSP))
#https://stats.libretexts.org/Bookshelves/Introductory_Statistics/Book%3A_Introductory_Statistics_(Shafer_and_Zhang)/02%3A_Descriptive_Statistics/2.05%3A_The_Empirical_Rule_and_Chebyshev%27s_Theorem
skewness(GSP)
#[1] 2.898876
#Moderately skewed to the right

#Density is another relevant way to show where data points are concentrated
plot(density(GSP))
abline(v = mean(GSP), col = "green") #drawing a vertical line for the average GDP
abline(v = median(GSP), col = "blue") #drawing a vertical line for median GDP
abline(v = mean(GSP)+sd(GSP), col = "lightblue") #drawing a vertical line that is 1 standard deviation from the average GDP
abline(v = mean(GSP)-sd(GSP), col = "orange")

#Law of large numbers (resampling)
mean(as.matrix(StateGDP$GDP_State))
mean(rnorm(10000, mean=mean(as.matrix(StateGDP$GDP_State)), sd=sd(as.matrix(StateGDP$GDP_State))))

#### Outliers + Range Distributions ####

#Creating a binary vector of states with GDP +- one standard deviation
St.dev <- ifelse(GSP>=mean(GSP)-sd(GSP) & GSP<=mean(GSP)+sd(GSP),1,0)
mean(GSP)-sd(GSP)
mean(GSP)+sd(GSP)
table(St.dev)# 47 out of 51 states are within this range
sum(St.dev)/length(St.dev) # 0.903 or 90%. Thus, 10% or 5 states are outside this range
#Empirical rule of thumb: Chebyshev's Theorem = 1−1/k^2 (k = number of standard deviations)
1-1/3^2# At least 89% of all not normally distributed data should be with 3 sd of the mean
#Boxplot is another handy tool to spot outliers, it shows 1st, 2nd (median), and 3rd Quartiles
#as well as maximum and minimum

#Outliers could potentially be spotted by looking at data points that are 1.5×IQR (interquartile range = Q3-Q1)
#away from either the third or the first quartile.
glimpse(boxplot(StateGDP$GDP_State))
StateGDP <- data.frame(StateGDP)
View(StateGDP %>%
       filter(GDP_State >= 1564))
#California, Texas, and New York have the highest GDP (potential outliers), followed by Florida.

#### Removing Outliers ####
#Excluding these three states for a prompt visual
sample1 <- StateGDP %>%
  filter(State != "New York", State != "Texas", State !="California")

plot(ecdf(sample1[,2]))

## Examples of distributions
x<-seq(from=0,to=6,length.out=100)
ylim<-c(0,0.6)
par(mfrow=c(2,2)) # Create a 2x2 plotting area >
plot(x,dunif(x,min=2,max=4),main="Uniform",type="l",ylim=ylim)
plot(x,dnorm(x,mean=3,sd=1),main="Normal",type="l",ylim=ylim)
plot(x,dexp(x,rate=1/2),main="Exponential",type="l",ylim=ylim) #Plot an Exponential density >
plot(x,dgamma(x,shape=2,rate=1),main="Gamma",type="l",ylim=ylim) #Plot a Gamma density

#### Fit Data Distribution (need library "fitdistrplus") ####
#Run the comparative distribution analysis
descdist(GSP, discrete = FALSE)
#Major types of distribution:
#"norm", "lnorm", "pois", "exp", "gamma", "nbinom", "geom", "beta", "unif" and "logis"
#Fit couple of distributions
#Try fit normal distribution
fit.norm <- fitdist(GSP, "norm")
plot(fit.norm)
fit.norm$aic #Akaike information criterion shows relative quality of statistical models for a given dataset
#It is only useful when comparing models to each other, as AIC estimator of out-of-sample prediction error.
#The purpose would be to find model that minimizes the information loss (i.e. has the minimum AIC)
#Try fit lognormal distribution
fit.lognorm <- fitdist(GSP, "lnorm")
plot(fit.lognorm)
fit.lognorm$aic
#Try fit exponential distribution
fit.exp <- fitdist(GSP, "exp")
plot(fit.exp)
fit.exp$aic
#Try fit logistic distribution
fit.logis <- fitdist(GSP, "logis")
plot(fit.logis)
fit.logis$aic

#Given the AIC values for the four distributions, our data is best represented by lognormal distribution,
#folowed by exponential, logistic, and then by normal distributions
#https://en.wikipedia.org/wiki/Log-normal_distribution