## SpIntro-Stats/helpers.R  (MTstateIntroStats/SpIntro-Stats, master branch)
## Functions to draw and store simulated proportions (bootstrap resampling and randomization)

## Bootstrap resamples of a single sample proportion.
##   shuffles: number of resamples wanted (passed through as.numeric(),
##             so a character count such as "1000" is accepted)
##   y1:       observed number of successes
##   n1:       sample size
## Returns a numeric vector of length 'shuffles'; each entry is a resampled
## success count divided by n1.
cat1_estimate_shuffles <- function(shuffles, y1, n1){
  shuffles <- as.numeric(shuffles)
  ## One preallocated slot per resample.  (Earlier code initialised 'y1new'
  ## but assigned into 'y1_new', which stopped with "object not found".)
  y1new <- numeric(shuffles)

  ## seq_len() gives an empty loop when shuffles is 0; 1:0 would run twice.
  for(i in seq_len(shuffles)){
    ## n1 Bernoulli(y1/n1) draws, summed into one resampled success count
    y1new[i] <- sum(rbinom(n1, 1, y1/n1))
  }
  y1new/n1
}

## Randomization draws for a difference in two proportions.
##   shuffles: number of draws (coerced with as.numeric())
##   y1, y2:   observed success counts in groups 1 and 2
##   n1, n2:   group sizes
## The pooled y1 + y2 successes are re-dealt with rhyper(): each draw is the
## number of successes that land in a group of size n1; group 2 receives the
## remaining successes.
## Returns a matrix with one row per draw and columns
## 'phat.1', 'phat.2', 'diff.p' (= phat.1 - phat.2).
cat2_test_shuffles <- function(shuffles, y1, y2, n1, n2){
  nDraws <- as.numeric(shuffles)
  pooledSuccess <- y1 + y2
  pooledFailure <- (n1 + n2) - pooledSuccess

  successGrp1 <- rhyper(nDraws, pooledSuccess, pooledFailure, n1)
  successGrp2 <- pooledSuccess - successGrp1

  phat.1 <- successGrp1/n1
  phat.2 <- successGrp2/n2
  ## cbind() already yields the 3-column matrix
  cbind(phat.1, phat.2, diff.p = phat.1 - phat.2)
}

## Hypergeometric draws of m1 units from two pooled groups.
##   shuffles: number of draws (coerced with as.numeric())
##   n1, n2:   sizes of group 1 and group 2 (the "white" and "black" balls)
##   m1:       number of units drawn each time
## Returns a data.frame with one row per draw:
##   y1, y2       how many of the m1 drawn units came from group 1 / group 2
##   phat1, phat2 those counts divided by n1 / n2
##   diffp        phat1 - phat2
c1Lurk_shuffles <- function(shuffles, n1, n2, m1){
  nDraws <- as.numeric(shuffles)
  fromGrp1 <- rhyper(nDraws, m=n1, n=n2, k=m1)
  fromGrp2 <- m1 - fromGrp1
  prop1 <- fromGrp1/n1
  prop2 <- fromGrp2/n2
  data.frame(phat1 = prop1,
             phat2 = prop2,
             diffp = prop1 - prop2,
             y1 = fromGrp1,
             y2 = fromGrp2)
}

## Function to draw and store difference in proportions for bootstrap
## Bootstrap resamples of two sample proportions and their difference.
##   shuffles: number of resamples (coerced with as.numeric())
##   y1, y2:   observed success counts in groups 1 and 2
##   n1, n2:   group sizes
## Each resample draws n1 Bernoulli(y1/n1) values and n2 Bernoulli(y2/n2)
## values and sums them into resampled success counts.
## Returns a matrix with one row per resample and columns
## 'phat.1', 'phat.2', 'diff.p' (= phat.1 - phat.2); zero rows when
## shuffles is 0.
cat2_estimate_shuffles <- function(shuffles, y1, y2, n1, n2){

  shuffles <- as.numeric(shuffles)
  ## preallocate instead of growing the vectors inside the loop
  y1new <- numeric(shuffles)
  y2new <- numeric(shuffles)

  ## seq_len() skips the loop for shuffles == 0 (1:0 would iterate twice).
  ## Group 1 is drawn before group 2 within each pass, so results for a
  ## given seed match the earlier loop.
  for(i in seq_len(shuffles)){
    y1new[i] <- sum(rbinom(n1, 1, y1/n1))
    y2new[i] <- sum(rbinom(n2, 1, y2/n2))
  }
  phat.1 <- y1new/n1
  phat.2 <- y2new/n2
  diff.p <- phat.1 - phat.2

  ## cbind() already returns the shuffles x 3 matrix
  cbind(phat.1, phat.2, diff.p)
}

## Build a matrix of resampled index values, one column per resample.
##   shuffles: number of resamples (coerced with as.numeric())
##   ndx1:     index values belonging to group 1
##   ndx2:     index values belonging to group 2
## Returns a matrix with length(ndx1) + length(ndx2) rows and 'shuffles'
## columns: the top length(ndx1) rows are drawn with replacement from ndx1,
## the bottom length(ndx2) rows with replacement from ndx2.
c1q1_estimate_shuffles <- function(shuffles, ndx1, ndx2){
  shuffles <- as.numeric(shuffles)

  ## Resample positions rather than values: sample(x, ...) reads a single
  ## number x as 1:x, which would draw wrong indices for a one-member group.
  ## For longer vectors this is exactly what sample() does internally, so
  ## seeded results are unchanged.
  resampleBlock <- function(ndx){
    picks <- sample.int(length(ndx), length(ndx) * shuffles, replace = TRUE)
    matrix(ndx[picks], ncol = shuffles)
  }

  rbind(resampleBlock(ndx1), resampleBlock(ndx2))
}

 ## finding break points for dot plots
## Stacking heights for a dot plot of simulated statistics.
##   simStats: numeric vector of simulated values
## The values are binned with cut(), starting from half the squared Sturges
## class count.  With fewer than 3000 values and more than 30 bins, the bin
## count may be raised in 10% steps: the counts within 10 bins either side of
## the fullest bin are examined, and while (max - 75th percentile) / median
## of those counts exceeds 0.8 and keeps shrinking, the data are re-cut.
## Returns, for every value, its running position (1, 2, 3, ...) inside its
## bin.  The result is ordered bin by bin, lowest bin first, not in the
## order of 'simStats'.
newy <- function(simStats){
  nBins <- 0.5*nclass.Sturges(simStats)^2
  binned <- cut(simStats, breaks = nBins)

  ## bin refinement is only attempted for smaller simulations
  if(length(simStats) < 3000){
    lastSpread <- 100
    while(nBins > 30){
      counts <- as.numeric(table(binned))
      peak <- which.max(counts)
      ## up to 10 bins on each side of the fullest bin
      centre <- counts[pmax(1, peak - 10):pmin(length(counts), peak + 10)]
      spread <- diff(quantile(centre, c(.75, 1)))/median(centre)
      ## stop once the centre is smooth enough or no longer improving
      if(!(spread > .8 & spread < lastSpread)) break
      nBins <- nBins * 1.1
      binned <- cut(simStats, breaks = nBins)
      lastSpread <- spread
    }
  }

  ## count upward within each bin, then flatten in bin order
  heights <- unlist(tapply(binned, binned, function(V) 1:length(V)))
  heights[!is.na(heights)]
}

  ## control the way p-values are printed  ##

## Build the sentence that reports a simulation p-value.
##   extremes:  number of simulated values at least as extreme as the cutoff
##   nreps:     total number of simulated values
##   direction: word(s) inserted before "than" (e.g. "greater")
##   cutoff:    the cutoff as entered; it must convert with as.numeric()
##   pvalue:    p-value to print when extremes > 0 (rounded to 5 places)
## Returns one character string.  With no extreme values the p-value is
## reported as a bound, "< 1/nreps".  A cutoff that does not convert to a
## number yields an explanatory message instead.
pvalue2print <- function(extremes, nreps, direction, cutoff, pvalue){
  if(is.na(as.numeric(cutoff)))
    return("Not able to interpret your input as a number.")

  if(extremes > 0){
    ## never report more extreme values than were simulated
    shown <- if(extremes > nreps) nreps else extremes
    return(paste(shown," of ",nreps," values are ",direction," than ", cutoff,
                 ". P-value = ", round(pvalue,5)))
  }

  ## one more decimal place than nreps has powers of ten
  places <- ceiling(log10(nreps))+1
  paste(extremes," of ",nreps," values are ",direction," than ", cutoff,
        ". P-value < 1/", nreps, " or P-value < ", round(1/nreps, places))
}

## functions for SPINNERS

## Number of spins needed to land on the first category.
##   prob: weights for the spinner's categories (rescaled to sum to 1)
##   reps: number of simulated sequences
## Returns 'reps' spin counts: a geometric number of misses plus the one
## successful spin.  Stops with an error for fewer than 2 categories.
draws2get1 <- function( prob, reps){
  nCat <- length(prob)
  pFirst <- (prob/sum(prob))[1]
  if(nCat < 2)
    stop("Must have at least 2 categories")
  misses <- rgeom(reps, prob = pFirst)
  misses + 1
}

## Number of spins needed to see every category at least once.
##   prob:    weights for the spinner's categories (rescaled to sum to 1)
##   reps:    number of simulated sequences (coerced with as.numeric())
##   fullOut: FALSE returns just the spin counts; TRUE returns a data.frame
##            with the counts ('nDraws'), then the categories in the order
##            they first appeared, then the spin number at which each one
##            first appeared
## Every sequence is simulated to a fixed length of
## nCat * round(4 / min(prob)) spins; if any sequence still lacks a category
## the function stops with an error rather than return a wrong count.
draws2get1ofEach <- function( prob, reps, fullOut=FALSE){
  nCat <- length(prob)
  prob <-  prob/sum(prob)
  reps <- as.numeric(reps)
  if(nCat < 2)
    stop("Must have at least 2 categories")

  ## one row per sequence, one column per spin
  spins <- matrix(sample(1:nCat, reps * nCat * round(4/min(prob)), prob = prob, replace=TRUE), nrow = reps)

  ## TRUE where a spin shows a category for the first time in its row
  firstSeen <- !t( apply(spins, 1, duplicated))
  nSeen <- apply(firstSeen, 1, function(x) sum(as.numeric(x)))
  if(any(nSeen < nCat))
    stop("Whoops, we missed some large runs")

  ## the spin on which the last missing category turned up
  nDraws <- apply(firstSeen, 1, function(x) max(which(x)))
  if(fullOut){
    draws <-  t(sapply(1:reps, function(i) spins[i,][firstSeen[i,]]))
    cols <- t(apply(firstSeen, 1, which))
    return(data.frame(nDraws, draws, cols))
  }
  nDraws
}

## Rebuild one possible history of spins from a simulation summary.
##   output: either a single spin count from 'draws2get1()', or one row of
##           the fullOut result of 'draws2get1ofEach()' laid out as
##           (count, categories in order of first appearance, spin numbers
##           of those first appearances)
##   prob:   weights for the spinner's categories (rescaled to sum to 1)
## Returns a vector of category numbers, one per spin.  With more than 2
## categories the history is not unique: the spins between "new category"
## events are filled in at random, so only the positions of the new
## categories are fixed.
reconstructSpins <- function( output, prob){
  nCat <-  length(prob)
  prob <-  prob/sum(prob)
  output <- unlist(output)

  ## a lone count: waiting for category 1, as in 'draws2get1()'
  if(length(output) == 1){
    if(output == 1) return(1)
    if(nCat == 2) return(rep(2:1, c(output-1,1)))
    ## misses come from categories 2..nCat, then the final spin hits 1
    misses <- sample(2:nCat, output - 1, prob[-1], replace=TRUE)
    return(c(misses, 1))
  }

  ## otherwise the summary must hold 1 count + nCat categories + nCat positions
  if(abs(nCat - (length(output) -1) / 2) > .01)
    stop("Output dimensions don't match length(prob)")

  newCats <- output[ 1 + seq_len(nCat)]
  firstAt <- output[ -seq_len(nCat + 1)]

  ## first category repeats until the second one appears
  history <- rep(newCats[1:2], c(firstAt[2]-firstAt[1],1))
  if(nCat > 2){
    for(k in 3:nCat){
      known <- newCats[1:(k-1)]
      gap <- firstAt[k] - firstAt[k-1] -1
      ## filler spins can only show categories already seen
      filler <- sample(known, gap, prob = prob[known], replace = TRUE)
      history <- c(history, filler, newCats[k])
    }
  }
  history
}

## Sample passage: the text of the Gettysburg Address as a single
## multi-paragraph character string (embedded newlines and leading spaces
## are part of the value).
## NOTE(review): presumably offered as example input for cleanText() --
## confirm against the callers.
gettysbrg <- "   Four score and seven years ago our fathers brought forth on this continent a new nation, conceived in liberty, and dedicated to the proposition that all men are created equal.

    Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battlefield of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.

    But, in a larger sense, we can not dedicate, we can not consecrate, we can not hallow this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us—that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion - that we here highly resolve that these dead shall not have died in vain - that this nation, under God, shall have a new birth of freedom - and that government of the people, by the people, for the people, shall not perish from the earth.
"

## Sample passage: a short joke stored as one character string with
## embedded line breaks.  Contractions are written without apostrophes
## ("didnt", "couldnt") in the stored text.
## NOTE(review): presumably a second example input for cleanText() --
## confirm against the callers.
joke <- "Four college friends were so confident that the weekend before finals, they decided to go to a
city several hours away to party with some friends. They had a great time. However, after all
the partying, they slept all day Sunday and didnt make it back to school until early Monday
morning.
Rather than taking the final then, they decided to find their professor after the final and explain
to him why they missed it.
They explained that they had gone to the city for the weekend with the plan to come back
and study but, unfortunately, they had a flat tire on the way back, didnt have a spare, and
couldnt get help for a long time. As a result, they missed the final.
The professor thought it over and then agreed they could make up the final the following day.
The four were elated and relieved.
They studied that night and went in the next day at the time the professor had told them.
The professor placed them in separate rooms and handed each of them a test booklet, and told them to
begin.
They looked at the first problem, worth five points. It was something simple about exploratory
data analysis. 'Cool,' they thought at the same time, each one in his separate room. 'This is
going to be easy.'
Each finished the problem and then turned the page. On the second page was written:
For 95 points: Which tire?"

## Split a text string into words and describe each word.
##   atxt: character string of text
## Steps: trim the ends, delete the punctuation : ; ( ) , . ? ' -, turn all
## white space into single blanks, convert to ASCII (characters that do not
## convert become blanks), split on blanks and drop empty pieces.  A
## one-letter piece other than "a" or "I" is glued onto the word before it
## and then removed from the list.
## Returns a data.frame with columns 'words', 'count' (number of characters)
## and 'hasAnE' (1 if the word contains a lower-case "e", else 0).
cleanText <- function(atxt){
  noPunct <- gsub("[:;(),.?'-]", "", trimws(atxt))
  oneLine <- gsub("[[:space:]]"," ", noPunct)
  ascii <- stringi::stri_enc_toascii(oneLine)
  ## stri_enc_toascii() leaves \032 where a character had no ASCII form
  ascii <- gsub('[\032]'," ", ascii)

  words <- unlist(strsplit(ascii, " "))
  words <- words[nchar(words)>0]      ## drop empty strings left by repeated blanks

  isFragment <- nchar(words) == 1 & words != "a" & words != "I"
  shorts <- which(isFragment)
  if(length(shorts)){
    for(wd in shorts){
      ## append the stray letter to its left-hand neighbour
      words[wd-1] <- paste(words[c(wd - 1, wd)], collapse = "")
    }
    words <- words[-shorts]
  }

  data.frame(words, count =nchar(words), hasAnE = grepl("e", words)+0 )
}

## Build a "Help?" link that opens a popover.
##   title:     popover title
##   content:   popover body text
##   placement: where the popover appears; the first match among
##              'right', 'top', 'left', 'bottom' is used
##   trigger:   what opens it; the first match among
##              'click', 'hover', 'focus', 'manual' is used
## Returns a tagList holding (1) a script, wrapped in singleton() and
## tags$head(), that calls .popover() on every element carrying
## data-toggle='popover', and (2) the <a> link with the data-* attributes.
## NOTE(review): tagList/singleton/tags are assumed to come from
## shiny/htmltools loaded elsewhere -- confirm.
helpPopup <- function(title, content = "Hello World", placement=c('right', 'top', 'left', 'bottom'),
                      trigger=c('click', 'hover', 'focus', 'manual')) {

  ## several.ok lets the untouched default vector through; keep its first entry
  where <- match.arg(placement, several.ok=TRUE)[1]
  when <- match.arg(trigger, several.ok=TRUE)[1]

  activate <- tags$script("$(function() { $(\"[data-toggle='popover']\").popover(); })")
  label <- tags$i("Help?") ## class="glyphicon-question-sign"
  link <- tags$a(href = "#", `data-toggle` = "popover", title = title, `data-content` = content,
                 `data-placement` = where,
                 `data-trigger` = when,
                 label)

  tagList(singleton(tags$head(activate)), link)
}

## Introductory help text for the bootstrap demonstration, stored as one
## character string.  The commented-out lines that follow hold an HTML
## list of step-by-step instructions that is currently not part of the
## string.
bootContent <- 'Start with a known "population". In this case our values are textbook
costs for one semester at MSU in tens of dollars.'
#   <ul>
#   <li> Click [Sample] and watch values get pulled from the population to the sample below.<BR>
#   The population then disappears, because in real data collection we never know  it.<BR>
#   Bootstrap resamples are based on just one sample.
# </li>
#   <li> Click [1 Resample].  You will see a re-sample of size 8 get selected
# from the original sample <b>with replacement</b>.  Some of the items in the
# sample get chosen multiple times (giving a darker background), and
# some not at all (background stays white).  The mean of
# the resample is shown as a point in the plot. It disappears after a few seconds.
# Click [slower] or [faster] to vary the speed.
# </li>
#   <li> Once you understand how one resample is picked,  click one
# of the "Many Resamples" options. Only the resample means are shown.
# A confidence interval appears on the plot as red boundary
# lines.  It is computed by the "percentile" method with the given
# percentage of means in the middle and equal numbers of points above and below.
# </li>
#   </ul>'