Based on the data set state.x77, test the normality of murder rate with one step ks-test.
# import data
state<-data.frame(state.x77)
# KS test
ks.test(state$Murder,'pnorm',mean(state$Murder),sd(state$Murder))
## Warning in ks.test(state$Murder, "pnorm", mean(state$Murder), sd(state$Murder)):
## ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: state$Murder
## D = 0.10955, p-value = 0.5859
## alternative hypothesis: two-sided
# Lilliefors test
library(nortest)
lillie.test(state$Murder)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: state$Murder
## D = 0.10955, p-value = 0.1403
The p-value of the Lilliefors (K-S) test is larger than 0.05, so there is no evidence that the murder rate depart significantly from normality.
In the ons-mcz-2015-subset3 data set, another variable MCZ_13 (Overall, how satisfied are you with the balance between the time spent on paid job and the time spent on other aspects of life? ) is included. With step-by-step and one step procedure, produce one sample k-s test to test the normality of MCZ_13 and two sample k-s test of MCZ_13 and RECODEDHealth, both of them need to be visualized.
#univariate analysis
ons_mcz <- read.csv('dataset-ons-mcz-2015-subset3.csv')
ons_mcz[ons_mcz==98|ons_mcz==99]<-NA
library(pastecs)
## Warning: 程辑包'pastecs'是用R版本4.1.3 来建造的
round(stat.desc(ons_mcz$MCZ_13), digits = 3)
## nbr.val nbr.null nbr.na min max range
## 1097.000 5.000 951.000 0.000 10.000 10.000
## sum median mean SE.mean CI.mean.0.95 var
## 7573.000 7.000 6.903 0.067 0.132 4.930
## std.dev coef.var
## 2.220 0.322
hist(ons_mcz$MCZ_13)# not show the normality
sample3<-ons_mcz$MCZ_13
cdf3<-ecdf(sample3)
# step by step
D2 = -Inf
index2<-order(unique(na.omit(sample3)))
minMax2<-unique(na.omit(sample3))[index2]#unique and ordered
for (i in minMax2) {
x <- max(abs(cdf3(i) - pnorm(i, mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE))),abs(cdf3(i-1) - pnorm(i, mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE))))
if (x >D2)
D2 <- x
}
D2
## [1] 0.1414447
v0 <- minMax2[which( abs(cdf3(minMax2) - pnorm(minMax2,
mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE)))==D2)]
v1 <- minMax2[which(abs(cdf3(minMax2-1) - pnorm(minMax2,
mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE)))
==D2)]
v0
## integer(0)
v1
## [1] 8
dd <- data.frame(x = sample3)
library(ggplot2)
## Warning: 程辑包'ggplot2'是用R版本4.1.3 来建造的
ggplot(dd, aes(x)) +
stat_ecdf(aes(colour = "Empirical")) +
stat_function(fun = pnorm, args = list(mean = mean(sample3,na.rm=TRUE), sd = sd(sample3,na.rm=TRUE)),
aes(colour = "Theoretical")) +xlab("Sample") +
ylab("ECDF") +
annotate(
"segment",
x = v1, xend = v1,
y = cdf3(v1-1),
yend = pnorm(v1, mean(sample3,na.rm=TRUE), sd(sample3,na.rm=TRUE)),
size = 1,col='yellow'
)
## Warning: Removed 951 rows containing non-finite values (stat_ecdf).
test_statistic<-D2
test_statistic
## [1] 0.1414447
critical_value<-1.63/sqrt(length(sample3))
critical_value
## [1] 0.03601825
# one step
ks.test(sample3,'pnorm',mean(sample3,na.rm=TRUE),sd(sample3,na.rm=TRUE))
## Warning in ks.test(sample3, "pnorm", mean(sample3, na.rm = TRUE), sd(sample3, :
## ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: sample3
## D = 0.14144, p-value < 2.2e-16
## alternative hypothesis: two-sided
Because the test statistic is larger than the critical value (0.1414447>0.03601825) and the p-value <0.01 in other way, reject H0 that the MCZ_13 data follow normal distribution.
#step by step
sample2<-ons_mcz$RECODEDHealth
cdf2<-ecdf(sample2)
ds<-data.frame(sample=c(sample3,sample2),group=c(rep("sample3", length(sample3)), rep("sample2", length(sample2))))
index3<-order(unique(na.omit(ds$sample)))
minMax3<-unique(na.omit(ds$sample))[index3]#unique and ordered
w <- minMax3[which.max( abs(cdf3(minMax3) - cdf2(minMax3)))]
w
## [1] 2
library(ggplot2)
ggplot(ds,aes(x=sample, group=group,color=group))+
stat_ecdf(size=1) +
theme_bw(base_size = 28) +
theme(legend.position ="top") +
xlab("Sample") +
ylab("ECDF") +
#geom_line(size=1) +
geom_segment(aes(x = w, y = cdf3(w), xend = w, yend = cdf2(w)),
linetype = "dashed", color = "red") +
geom_point(aes(x = w , y= cdf3(w)), color="red", size=8) +
geom_point(aes(x = w , y= cdf2(w)), color="red", size=8) +
theme(legend.title=element_blank())
## Warning: Removed 992 rows containing non-finite values (stat_ecdf).
par(mfrow=c(2,2))
t_statistic<-max( abs(cdf3(w) - cdf2(w)),na.rm=TRUE)
t_statistic
## [1] 0.9626253
critical_value<-1.63*sqrt(1/length(sample3)+1/length(sample2))
critical_value
## [1] 0.0509375
#one step
ks.test(sample2, sample3)
## Warning in ks.test(sample2, sample3): p-value will be approximate in the
## presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: sample2 and sample3
## D = 0.96263, p-value < 2.2e-16
## alternative hypothesis: two-sided
Because the test statistic is larger than the critical value (0.9626253>0.0509375) and the p-value <0.01 in other way, reject H0 that MCZ_13 and RECODEDHealth come from the same distribution.