# # Goodness-of-Fit test for loss distributions # #====================================================================================== X <- c(300, 400, 2800, 4500, 4900, 5000, 7700, 9600, 10400, 10600, 11200, 11400, 12200, 12900, 13400, 14100, 15500, 19300, 19400, 22100, 24800, 29600, 32200, 32500, 33700, 34300, 37300, 39500, 39900, 41200, 42800, 45900, 49200, 54600, 56700, 57200, 57500, 59100, 60800, 62500, 63600, 66400, 66900, 68100, 68900, 71100, 72100, 79900, 80700, 83200, 84500, 84600, 86600, 88600, 91700, 96600, 96900, 106800, 107800, 111900, 113000, 113200, 115000, 117100, 119300, 122000, 123100, 126600, 127300, 127600, 127900, 128000, 131300, 132900, 134300, 134700, 135800, 146100, 150300, 171800, 173200, 177700, 183000, 183300, 190100, 209400, 212900, 225100, 226600, 233200, 234200, 244900, 253400, 261300, 261800, 273300, 276200, 284300, 316300, 322600, 343400, 350700, 395800, 406900, 423200, 437900, 442700, 457800, 463000, 469300, 469600, 544300, 552700, 566700, 571800, 596500, 737700, 766100, 846100, 852700, 920300, 981100, 988300, 1078800, 1117600, 1546800, 2211000, 2229700, 3961000, 4802200) Fn <- ecdf(X) #- Fn becomes an function plot( ecdf(X) ) length(X) mean(X) var(X) sd(X) t=seq(1,5000000, 100) plot(t, Fn(t), type="l" ) lines(t,pexp(t,1/mean(X)), col="red") #--- MLE ------------------ library(MASS) #- Exp Exp.hat <- fitdistr(X, "exponential" )$estimate plot(t, Fn(t), type="l" ) lines(t, pexp(t, Exp.hat), col="red" ) #- Gamma fitdistr(X, "gamma", start=list(shape=1, scale=1))$estimate #- Gamma MLE needs starting point M1 <- mean(X) M2 <- mean(X^2) b.hat <- (M2 - M1^2)/ M1 #MME a.hat <- M1 / b.hat Gam.hat <- fitdistr(X, "gamma", start=list(shape=a.hat, scale=b.hat))$estimate Gam.hat plot(t, Fn(t), type="l" ) lines(t, pgamma(t, shape=Gam.hat[1], scale=Gam.hat[2]), col="blue" ) #- LogNormal Ln.hat <- fitdistr(X, "lognormal")$estimate Ln.hat plot(t, Fn(t), type="l" ) lines(t, plnorm(t, Ln.hat[1], Ln.hat[2]), col="green" ) #- All Three plot(t, Fn(t), type="l" ) lines(t, plnorm(t, Ln.hat[1], Ln.hat[2]), col="green" ) lines(t, pgamma(t, shape=Gam.hat[1], scale=Gam.hat[2]), col="blue" ) lines(t, pexp(t, Exp.hat), col="red" ) #--- K-S test --------------------------------------- KS1 <- Fn(t) - pexp(t, 1/mean(X)) KS2 <- Fn(t) - pgamma(t, shape=Gam.hat[1], scale=Gam.hat[2]) KS3 <- Fn(t) - plnorm(t, Ln.hat[1], Ln.hat[2]) max( abs(KS1) ) max( abs(KS2) ) max( abs(KS3) ) plot(KS1, type="l", col="red"); abline(h=0) lines(KS2, col="blue") lines(KS3, col="green") #- Built in KS test (p-value can't be used since parameters are estimated) - ks.test(X, "pexp", 1/mean(X)) #- same as KS1 ks.test(X, "pgamma", shape=Gam.hat[1], scale=Gam.hat[2]) ks.test(X, "plnorm", Ln.hat[1], Ln.hat[2]) #--- MC simulation to get CV for K-S test n <- length(X) Ln.hat t <- seq(1,5000000, 100) KS1.sim <- 0 Exp.hat.sim <- matrix(0, 5000,1) for (i in 1:5000) { X.sim <- rexp(n, 1/mean(X)) Fn.sim <- ecdf(X.sim) Exp.hat.sim[i]<- fitdistr(X.sim, "exponential")$estimate KS1.sim[i] <- max( abs( Fn.sim(t) - pexp(t, Exp.hat.sim[i]) )) } hist(KS1.sim) KS2.sim <- 0 Gam.hat.sim <- matrix(0, 5000,2) for (i in 1:5000) { X.sim <- rgamma(n, shape=Gam.hat[1], scale=Gam.hat[2]) Fn.sim <- ecdf(X.sim) M1 <- mean(X.sim) M2 <- mean(X.sim^2) b.hat <- (M2 - M1^2)/ M1 #MME a.hat <- M1 / b.hat Gam.hat.sim[i, ] <- fitdistr(X.sim, "gamma", start=list(shape=a.hat, scale=b.hat))$estimate KS2.sim[i] <- max( abs( Fn.sim(t) - pgamma(t, shape=Gam.hat.sim[i,1], scale=Gam.hat.sim[i,2]) )) } hist(KS2.sim) KS3.sim <- 0 Ln.hat.sim <- matrix(0, 5000,2) for (i in 1:5000) { X.sim <- rlnorm(n, 11.587475, 1.603259) Fn.sim <- ecdf(X.sim) Ln.hat.sim[i, ] <- fitdistr(X.sim, "lognormal")$estimate KS3.sim[i] <- max( abs( Fn.sim(t) - plnorm(t, Ln.hat.sim[i,1], Ln.hat.sim[i,2]) )) #plot(t, Fn.sim(t), type="l" ) #lines(t, plnorm(t, Ln.hat.sim[i,1], Ln.hat.sim[i,2]), col="green" ) } hist(KS3.sim) sort(KS1.sim)[4750] sort(KS2.sim)[4750] sort(KS3.sim)[4750] sort(KS1.sim)[4500] sort(KS2.sim)[4500] sort(KS3.sim)[4500] #- Q-Q plots ----------------------------------------- n <- length(X) t <- ((0:n-1)+.5)/n s X1 <- sort(X) qqplot(qexp(t, 1/mean(X)), X1, main="Exponential Q-Q Plot", ylab="Sample Quantiles") abline(0,1) qqplot(qgamma(t, shape=Gam.hat[1], scale=Gam.hat[2] ), X1, main="Gamma Q-Q Plot", ylab="Sample Quantiles") abline(0,1) qqplot(qlnorm(t, Ln.hat[1], Ln.hat[2] ), X1, main="Lognormal Q-Q Plot", ylab="Sample Quantiles") abline(0,1) #----------------------------------------------------------- #--- Stable Distribution ----------------------------------- #-- alpha=2 is N(0,sqrt(2)) -- library("stabledist") x <- seq(-10,10,.1) plot( x, dstable(x, alpha=2,beta=0)) lines(x, dnorm(x, 0, sqrt(2)), col='red') #-- alpha=1 is Cauchy -- x <- seq(-10,10,.1) plot(x, dstable(x, alpha=1,beta=0)) lines(x, dcauchy(x), col='red') #-- alpha=? -- x <- seq(-10,10,.1) plot(x, dstable(x, alpha=1.8, beta=0)) lines(x, dnorm(x, 0, sqrt(2)), col='red') #--- 1. GOF test with Kolmogolov-Smirnov (known null) ----------- #-- limiting distribution under null is sup of |BB(F(t))| D1 <- D2 <- D3 <- 0 for (itt in 1:1000){ X1 <- rnorm(100, 0, 1) X2 <- rnorm(1000, 0, 1) X3 <- rnorm(5000, 0, 1) D1[itt] <- ks.test( X1, "pnorm", 0, 1)$statistic D2[itt] <- ks.test( X2, "pnorm", 0, 1)$statistic D3[itt] <- ks.test( X3, "pnorm", 0, 1)$statistic } x <- seq(0,5,.1) E1 <- ecdf( sqrt(100 )*D1 ); sort(sqrt(100 )*D1)[950] #1.323159 E2 <- ecdf( sqrt(1000)*D2 ); sort(sqrt(1000)*D2)[950] E3 <- ecdf( sqrt(5000)*D3 ); sort(sqrt(5000)*D3)[950] plot( x, E1(x), type='l') lines(x, E2(x), col='red') lines(x, E3(x), col='green') #--- 2. Compare limiting distribution for Normal vs Exp ------ # for known null and location-scale null D1 <- D2 <- D3 <- D4 <- 0 for (itt in 1:1000){ X1 <- rnorm(1000, 2, 3) X2 <- rexp(1000, .5) D1[itt] <- ks.test( X1, "pnorm", 2, 3 )$statistic D2[itt] <- ks.test( X1, "pnorm", mean(X1), sd(X1))$statistic D3[itt] <- ks.test( X2, "pexp", .5 )$statistic D4[itt] <- ks.test( X2, "pexp", 1/mean(X2) )$statistic } x <- seq(0,5,.1) E1 <- ecdf( sqrt(n)*D1 ); sort(sqrt(n)*D1)[950] #1.323159 E2 <- ecdf( sqrt(n)*D2 ); sort(sqrt(n)*D2)[950] E3 <- ecdf( sqrt(n)*D3 ); sort(sqrt(n)*D3)[950] E4 <- ecdf( sqrt(n)*D4 ); sort(sqrt(n)*D4)[950] plot( x, E1(x), type='l') lines(x, E2(x), col='red') lines(x, E3(x), col='green') lines(x, E4(x), col='blue') #black and green is same because they are both known null, but not red and blue #--- 3. Comparing limting distribution of ------------------------- # known null vs location-scale null for Nomral library("stabledist") n <- 1000 D1 <- D2 <- D3 <- 0 for (itt in 1:1000){ X1 <- rnorm(n, 0, 1) X2 <- rnorm(n, 2, 3) D1[itt] <- ks.test( X1, "pnorm", 0, 1)$statistic D2[itt] <- ks.test( X1, "pnorm", mean(X1), sd(X1))$statistic D3[itt] <- ks.test( X2, "pnorm", mean(X2), sd(X2))$statistic } x <- seq(0,3,.1) E1 <- ecdf( sqrt(n)*D1 ); sort(sqrt(n)*D1)[950] # 1.34146 E2 <- ecdf( sqrt(n)*D2 ); sort(sqrt(n)*D2)[950] # 0.90658 E3 <- ecdf( sqrt(n)*D3 ); sort(sqrt(n)*D3)[950] # 0.92042 plot( x, E1(x), type='l') lines(x, E2(x), col='red') lines(x, E3(x), col='green') #--- 3. Rejecting stable(a=1.9) with location-scale null ----------- library("stabledist") n <- 1000 D1 <- D2 <- 0 for (itt in 1:1000){ X1 <- rnorm(n, 2, 3) X2 <- rstable(n, alpha=1.9, beta=0) D1[itt] <- ks.test( X1, "pnorm", mean(X1), sd(X1))$statistic D2[itt] <- ks.test( X2, "pnorm", mean(X2), sd(X2))$statistic } x <- seq(0,5,.1) E1 <- ecdf( sqrt(n)*D1 ); CV05=sort(sqrt(n)*D1)[950] E2 <- ecdf( sqrt(n)*D2 ); power=sum(sqrt(n)*D2>CV05)/1000 plot( x, E1(x), type='l') lines(x, E2(x), col='red') abline(v=CV05) CV05 power