Exercise1

Based on the data set state.x77, test the normality of murder rate with one step ks-test.

# import data
state<-data.frame(state.x77)

# KS test
ks.test(state$Murder,'pnorm',mean(state$Murder),sd(state$Murder))
## Warning in ks.test(state$Murder, "pnorm", mean(state$Murder), sd(state$Murder)):
## ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  state$Murder
## D = 0.10955, p-value = 0.5859
## alternative hypothesis: two-sided
# Lilliefors test
library(nortest)
lillie.test(state$Murder)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  state$Murder
## D = 0.10955, p-value = 0.1403

The p-value of the Lilliefors (K-S) test is larger than 0.05, so there is no evidence that the murder rate depart significantly from normality.

Exercise 2

In the ons-mcz-2015-subset3 data set, another variable MCZ_13 (Overall, how satisfied are you with the balance between the time spent on paid job and the time spent on other aspects of life? ) is included. With step-by-step and one step procedure, produce one sample k-s test to test the normality of MCZ_13 and two sample k-s test of MCZ_13 and RECODEDHealth, both of them need to be visualized.

one sample

#univariate analysis
ons_mcz <- read.csv('dataset-ons-mcz-2015-subset3.csv')
ons_mcz[ons_mcz==98|ons_mcz==99]<-NA
library(pastecs)
## Warning: 程辑包'pastecs'是用R版本4.1.3 来建造的
round(stat.desc(ons_mcz$MCZ_13), digits = 3)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##     1097.000        5.000      951.000        0.000       10.000       10.000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##     7573.000        7.000        6.903        0.067        0.132        4.930 
##      std.dev     coef.var 
##        2.220        0.322
hist(ons_mcz$MCZ_13)# not show the normality

sample3<-ons_mcz$MCZ_13
cdf3<-ecdf(sample3)
# step by step
D2 = -Inf   
index2<-order(unique(na.omit(sample3)))
minMax2<-unique(na.omit(sample3))[index2]#unique and ordered
for (i in minMax2) {
  x <- max(abs(cdf3(i) - pnorm(i, mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE))),abs(cdf3(i-1) - pnorm(i, mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE))))
  if (x >D2) 
    D2 <- x
}
D2
## [1] 0.1414447
v0 <- minMax2[which( abs(cdf3(minMax2) - pnorm(minMax2, 
                      mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE)))==D2)] 
v1 <- minMax2[which(abs(cdf3(minMax2-1) - pnorm(minMax2, 
                      mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE)))
              ==D2)] 
v0
## integer(0)
v1
## [1] 8
dd <- data.frame(x = sample3)
library(ggplot2)
## Warning: 程辑包'ggplot2'是用R版本4.1.3 来建造的
ggplot(dd, aes(x)) +
     stat_ecdf(aes(colour = "Empirical")) +
     stat_function(fun = pnorm, args = list(mean = mean(sample3,na.rm=TRUE),                    sd = sd(sample3,na.rm=TRUE)),
                   aes(colour = "Theoretical")) +xlab("Sample") +
     ylab("ECDF") +
     annotate(
       "segment",
       x = v1, xend = v1,
       y = cdf3(v1-1), 
       yend = pnorm(v1, mean(sample3,na.rm=TRUE), sd(sample3,na.rm=TRUE)),
       size = 1,col='yellow'
     )
## Warning: Removed 951 rows containing non-finite values (stat_ecdf).

test_statistic<-D2
test_statistic
## [1] 0.1414447
critical_value<-1.63/sqrt(length(sample3))
critical_value
## [1] 0.03601825
# one step
   ks.test(sample3,'pnorm',mean(sample3,na.rm=TRUE),sd(sample3,na.rm=TRUE))
## Warning in ks.test(sample3, "pnorm", mean(sample3, na.rm = TRUE), sd(sample3, :
## ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  sample3
## D = 0.14144, p-value < 2.2e-16
## alternative hypothesis: two-sided

Because the test statistic is larger than the critical value (0.1414447>0.03601825) and the p-value <0.01 in other way, reject H0 that the MCZ_13 data follow normal distribution.

two sample

#step by step
sample2<-ons_mcz$RECODEDHealth
cdf2<-ecdf(sample2)

ds<-data.frame(sample=c(sample3,sample2),group=c(rep("sample3", length(sample3)), rep("sample2", length(sample2))))
index3<-order(unique(na.omit(ds$sample)))
minMax3<-unique(na.omit(ds$sample))[index3]#unique and ordered
   
w <- minMax3[which.max( abs(cdf3(minMax3) - cdf2(minMax3)))] 
w
## [1] 2
library(ggplot2)
ggplot(ds,aes(x=sample, group=group,color=group))+
     stat_ecdf(size=1) +
     theme_bw(base_size = 28) +
     theme(legend.position ="top") +
     xlab("Sample") +
     ylab("ECDF") +
     #geom_line(size=1) +
     geom_segment(aes(x = w, y = cdf3(w), xend = w, yend = cdf2(w)),
                  linetype = "dashed", color = "red") +
     geom_point(aes(x = w , y= cdf3(w)), color="red", size=8) +
     geom_point(aes(x = w , y= cdf2(w)), color="red", size=8) +
     theme(legend.title=element_blank())
## Warning: Removed 992 rows containing non-finite values (stat_ecdf).

     par(mfrow=c(2,2))
t_statistic<-max( abs(cdf3(w) - cdf2(w)),na.rm=TRUE)
t_statistic
## [1] 0.9626253
critical_value<-1.63*sqrt(1/length(sample3)+1/length(sample2))
critical_value
## [1] 0.0509375
#one step
ks.test(sample2, sample3)
## Warning in ks.test(sample2, sample3): p-value will be approximate in the
## presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  sample2 and sample3
## D = 0.96263, p-value < 2.2e-16
## alternative hypothesis: two-sided

Because the test statistic is larger than the critical value (0.9626253>0.0509375) and the p-value <0.01 in other way, reject H0 that MCZ_13 and RECODEDHealth come from the same distribution.