R Under development (unstable) (2024-02-13 r85898 ucrt) -- "Unsuffered Consequences" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > options(na.action=na.exclude) # preserve missings > options(contrasts=c('contr.treatment', 'contr.poly')) #ensure constrast type > library(survival) > > # > # Reproduce example 1 in the SAS lifereg documentation > # > > # this fit doesn't give the same log-lik that they claim > fit1 <- survreg(Surv(time, status) ~ I(1000/(273.2+temp)), imotor, + subset=(temp>150), dist='lognormal') > summary(fit1) Call: survreg(formula = Surv(time, status) ~ I(1000/(273.2 + temp)), data = imotor, subset = (temp > 150), dist = "lognormal") Value Std. Error z p (Intercept) -10.471 2.772 -3.78 0.00016 I(1000/(273.2 + temp)) 8.322 1.284 6.48 9.1e-11 Log(scale) -0.504 0.183 -2.75 0.00596 Scale= 0.604 Log Normal distribution Loglik(model)= -145.9 Loglik(intercept only)= -155 Chisq= 18.3 on 1 degrees of freedom, p= 1.9e-05 Number of Newton-Raphson Iterations: 6 n= 30 > > # This one, with the loglik on the transformed scale (the inappropriate > # scale, Ripley & Venables would argue) does agree. > # All coefs are of course identical. > fit2 <- survreg(Surv(log(time), status) ~ I(1000/(273.2+temp)), imotor, + subset=(temp>150), dist='gaussian') > > > # Give the quantile estimates, which is the lower half of "output 48.1.5" > # in the SAS 9.2 manual > > pp1 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9), + type='quantile', se=T) > pp2 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9), + type='uquantile', se=T) > pp1 $fit [,1] [,2] [,3] [1,] 12033.185 26095.677 56592.20 [2,] 4536.877 9838.864 21336.98 $se.fit [,1] [,2] [,3] [1,] 5482.338 11359.450 26036.917 [2,] 1443.072 2901.155 7172.343 > > temp130 <- matrix(0, nrow=3, ncol=6) > temp130[,1] <- pp1$fit[1,] > temp130[,2] <- pp1$se.fit[1,] > temp130[,3] <- pp2$fit[1,] > temp130[,4] <- pp2$se.fit[1,] > temp130[,5] <- exp(pp2$fit[1,] - 1.64*pp2$se.fit[1,]) > temp130[,6] <- exp(pp2$fit[1,] + 1.64*pp2$se.fit[1,]) > dimnames(temp130) <- list(c("p=.1", "p=.2", "p=.3"), + c("Time", "se(time)", "log(time)", "se[log(time)]", + "lower 90", "upper 90")) > print(temp130) Time se(time) log(time) se[log(time)] lower 90 upper 90 p=.1 12033.18 5482.338 9.395424 0.4556015 5700.089 25402.68 p=.2 26095.68 11359.450 10.169525 0.4353001 12779.950 53285.37 p=.3 56592.20 26036.917 10.943626 0.4600796 26611.422 120349.71 > > # A set of examples, copied from the manual pages of SAS procedure > # "reliability", which is part of their QC product. > # > > color <- c("black", "red", "green", "blue", "magenta", "red4", + "orange", "DarkGreen", "cyan2", "DarkViolet") > palette(color) > pdf(file='reliability.pdf') > > # > # Insulating fluids example > # > > # Adding a -1 to the fit just causes the each group to have it's own > # intercept, rather than a global intercept + constrasts. The strata > # statement allows each to have a separate scale > ffit <- survreg(Surv(time) ~ factor(voltage) + strata(voltage) -1, ifluid) > > # Get predicted quantiles at each of the voltages > # By default predict() would give a line of results for each observation, > # I only want the unique set of x's, i.e., only 4 cases > uvolt <- sort(unique(ifluid$voltage)) #the unique levels > plist <- c(1, 2, 5, 1:9 *10, 95, 99)/100 > pred <- predict(ffit, type='quantile', p=plist, + newdata=data.frame(voltage=uvolt)) > tfun <- function(x) log(-log(1-x)) > > matplot(t(pred), tfun(plist), type='l', log='x', lty=1, + col=1:4, yaxt='n', + xlab="Predicted", ylab="Quantile") > axis(2, tfun(plist), format(100*plist), adj=1) > > kfit <- survfit(Surv(time) ~ voltage, ifluid, type='fleming') #KM fit > for (i in 1:4) { + temp <- kfit[i] + points(temp$time, tfun(1-temp$surv), col=i, pch=i) + } > > # Now a table > temp <- array(0, dim=c(4,4,4)) #4 groups by 4 parameters by 4 stats > temp[,1,1] <- ffit$coef # "EV Location" in SAS manual > temp[,2,1] <- ffit$scale # "EV scale" > temp[,3,1] <- exp(ffit$coef) # "Weibull Scale" > temp[,4,1] <- 1/ffit$scale # "Weibull Shape" > > temp[,1,2] <- sqrt(diag(ffit$var))[1:4] #standard error > temp[,2,2] <- sqrt(diag(ffit$var))[5:8] * ffit$scale > temp[,3,2] <- temp[,1,2] * temp[,3,1] > temp[,4,2] <- temp[,2,2] / (temp[,2,1])^2 > > temp[,1,3] <- temp[,1,1] - 1.96*temp[,1,2] #lower conf limits > temp[,1,4] <- temp[,1,1] + 1.96*temp[,1,2] # upper > # log(scale) is the natural parameter, in which the routine did its fitting > # and on which the std errors were computed > temp[,2, 3] <- exp(log(ffit$scale) - 1.96*sqrt(diag(ffit$var))[5:8]) > temp[,2, 4] <- exp(log(ffit$scale) + 1.96*sqrt(diag(ffit$var))[5:8]) > > temp[,3, 3:4] <- exp(temp[,1,3:4]) > temp[,4, 3:4] <- 1/temp[,2,4:3] > > dimnames(temp) <- list(uvolt, c("EV Location", "EV Scale", "Weibull scale", + "Weibull shape"), + c("Estimate", "SE", "lower 95% CI", "uppper 95% CI")) > print(aperm(temp, c(2,3,1)), digits=5) , , 26 Estimate SE lower 95% CI uppper 95% CI EV Location 6.86249 1.10404 4.69857 9.0264 EV Scale 1.83423 0.96114 0.65677 5.1227 Weibull scale 955.74665 1055.18620 109.78973 8320.0103 Weibull shape 0.54519 0.28568 0.19521 1.5226 , , 30 Estimate SE lower 95% CI uppper 95% CI EV Location 4.35133 0.30151 3.76037 4.9423 EV Scale 0.94446 0.22544 0.59156 1.5079 Weibull scale 77.58159 23.39176 42.96420 140.0911 Weibull shape 1.05881 0.25274 0.66318 1.6904 , , 34 Estimate SE lower 95% CI uppper 95% CI EV Location 2.50326 0.31476 1.88632 3.1202 EV Scale 1.29732 0.22895 0.91796 1.8334 Weibull scale 12.22222 3.84707 6.59509 22.6506 Weibull shape 0.77082 0.13603 0.54542 1.0894 , , 38 Estimate SE lower 95% CI uppper 95% CI EV Location 0.00092629 0.27318 -0.53450 0.53635 EV Scale 0.73367610 0.20380 0.42565 1.26460 Weibull scale 1.00092672 0.27343 0.58596 1.70976 Weibull shape 1.36299929 0.37861 0.79077 2.34933 > > rm(temp, uvolt, plist, pred, ffit, kfit) > > ##################################################################### > # Turbine cracks data > crack2 <- with(cracks, data.frame(day1=c(NA, days), day2=c(days, NA), + n=c(fail, 167-sum(fail)))) > cfit <- survreg(Surv(day1, day2, type='interval2') ~1, + dist='weibull', data=crack2, weight=n) > > summary(cfit) Call: survreg(formula = Surv(day1, day2, type = "interval2") ~ 1, data = crack2, weights = n, dist = "weibull") Value Std. Error z p (Intercept) 7.6880 0.0744 103.30 < 2e-16 Log(scale) -0.3953 0.0987 -4.01 6.2e-05 Scale= 0.674 Weibull distribution Loglik(model)= -309.6 Loglik(intercept only)= -309.6 Number of Newton-Raphson Iterations: 5 n= 9 > #Their output also has Wiebull scale = exp(cfit$coef), shape = 1/(cfit$scale) > > # Draw the SAS plot > # The "type=fleming" argument reflects that they estimate hazards rather than > # survival, and forces a Nelson-Aalen hazard estimate > # > plist <- c(1, 2, 5, 1:8 *10)/100 > plot(qsurvreg(plist, cfit$coef, cfit$scale), tfun(plist), log='x', + yaxt='n', type='l', + xlab="Weibull Plot for Time", ylab="Percent") > axis(2, tfun(plist), format(100*plist), adj=1) > > kfit <- survfit(Surv(day1, day2, type='interval2') ~1, data=crack2, + weight=n, type='fleming') > # Only plot point where n.event > 0 > # Why? I'm trying to match them. Personally, all should be plotted. > who <- (kfit$n.event > 0) > points(kfit$time[who], tfun(1-kfit$surv[who]), pch='+') > points(kfit$time[who], tfun(1-kfit$upper[who]), pch='-') > points(kfit$time[who], tfun(1-kfit$lower[who]), pch='-') > > text(rep(3,6), seq(.5, -1.0, length=6), + c("Scale", "Shape", "Right Censored", "Left Censored", + "Interval Censored", "Fit"), adj=0) > text(rep(9,6), seq(.5, -1.0, length=6), + c(format(round(exp(cfit$coef), 2)), + format(round(1/cfit$scale, 2)), + format(tapply(crack2$n, cfit$y[,3], sum)), "ML"), adj=1) > > # Now a portion of his percentiles table > # I don't get the same SE as SAS, I haven't checked out why. The > # estimates and se for the underlying Weibull model are the same. > temp <- predict(cfit, type='quantile', p=plist, se=T) > tempse <- sqrt(temp$se[1,]) > mat <- cbind(temp$fit[1,], tempse, + temp$fit[1,] -1.96*tempse, temp$fit[1,] + 1.96*tempse) > dimnames(mat) <- list(plist*100, c("Estimate", "SE", "Lower .95", "Upper .95")) > print(mat) Estimate SE Lower .95 Upper .95 1 98.47183 5.321676 88.04134 108.9023 2 157.59360 6.186246 145.46856 169.7186 5 295.17088 7.377034 280.71189 309.6299 10 479.31737 8.227489 463.19149 495.4432 20 794.55351 8.951640 777.00829 812.0987 30 1089.70373 9.404613 1071.27068 1108.1368 40 1387.95516 9.983696 1368.38712 1407.5232 50 1704.71015 10.888487 1683.36872 1726.0516 60 2057.23906 12.217230 2033.29329 2081.1848 70 2472.58593 14.035515 2445.07632 2500.0955 80 3006.43572 16.510569 2974.07501 3038.7964 > > # > # The cracks data has a particularly easy estimate, so use > # it to double check code > time <- c(crack2$day2[1], (crack2$day1 + crack2$day2)[2:8]/2, + crack2$day1[9]) > cdf <- cumsum(crack2$n)/sum(crack2$n) > all.equal(kfit$time, time) [1] TRUE > all.equal(kfit$surv, 1-cdf[c(1:8,8)]) [1] TRUE > rm(time, cdf, kfit) > > > ####################################################### > # > # Replacement of valve seats in diesel engines > # The input data has id, time, and an indicator of whether there was an > # event at that time: 0=no, 1=yes. No one has an event at their last time. > # The input data has two engines with dual failures: 328 loses 2 valves at > # time 653, and number 402 loses 2 at time 139. For each, fudge the first > # time to be .1 days earlier. > # > ties <- which(diff(valveSeat$time)==0 & diff(valveSeat$id)==0) > temp <- valveSeat$time > temp[ties] <- temp[ties] - .1 > n <- length(temp) > first <- !duplicated(valveSeat$id) > vtemp <- with(valveSeat, data.frame(id =id, + time1= ifelse(first, 0, c(0, temp[-n])), + time2= temp, status=status)) > > kfit <- survfit(Surv(time1, time2, status) ~1, vtemp, id=id) > > plot(kfit, fun='cumhaz', ylab="Sample Mean Cumulative Failures", xlab='Time') > title("Valve replacement data") > > # The summary.survfit function doesn't have an option for printing out > # cumulative hazards instead of survival --- need to add that > # so I just reprise the central code of print.summary.survfit > xx <- summary(kfit) > temp <- cbind(xx$time, xx$n.risk, xx$n.event, xx$cumhaz, + xx$std.chaz, -log(xx$upper), -log(xx$lower)) > dimnames(temp) <- list(rep("", nrow(temp)), + c("time", "n.risk", "n.event", "Cum haz", "std.err", + "lower 95%", "upper 95%")) > print(temp, digits=2) time n.risk n.event Cum haz std.err lower 95% upper 95% 61 41 1 0.024 0.024 0.0000 0.073 76 41 1 0.049 0.034 0.0000 0.117 84 41 1 0.073 0.041 0.0000 0.156 87 41 1 0.098 0.046 0.0057 0.192 92 41 1 0.122 0.051 0.0208 0.226 98 41 1 0.146 0.055 0.0373 0.259 120 41 1 0.171 0.059 0.0548 0.291 139 41 1 0.195 0.062 0.0732 0.322 139 41 1 0.220 0.073 0.0750 0.369 165 41 1 0.244 0.075 0.0954 0.398 166 41 1 0.268 0.077 0.1163 0.427 202 41 1 0.293 0.079 0.1376 0.455 206 41 1 0.317 0.088 0.1452 0.497 249 41 1 0.341 0.089 0.1675 0.524 254 41 1 0.366 0.090 0.1903 0.551 258 41 1 0.390 0.090 0.2133 0.577 265 41 1 0.415 0.091 0.2368 0.603 276 41 1 0.439 0.098 0.2479 0.641 298 41 1 0.463 0.110 0.2490 0.689 323 41 1 0.488 0.110 0.2734 0.714 326 41 1 0.512 0.110 0.2981 0.739 328 41 1 0.537 0.115 0.3124 0.774 344 41 1 0.561 0.115 0.3376 0.798 348 41 1 0.585 0.124 0.3430 0.842 349 41 1 0.610 0.124 0.3686 0.866 367 41 1 0.634 0.123 0.3945 0.890 377 41 1 0.659 0.132 0.4018 0.932 404 40 1 0.684 0.136 0.4189 0.965 408 40 1 0.709 0.140 0.4365 0.998 410 40 1 0.734 0.139 0.4631 1.022 449 40 1 0.759 0.143 0.4813 1.055 479 40 1 0.784 0.146 0.4998 1.087 497 40 1 0.809 0.149 0.5187 1.118 538 40 1 0.834 0.152 0.5380 1.150 539 40 1 0.859 0.155 0.5576 1.181 561 40 1 0.884 0.162 0.5696 1.220 563 40 1 0.909 0.164 0.5898 1.250 570 40 1 0.934 0.170 0.6029 1.287 573 40 1 0.959 0.169 0.6310 1.310 581 38 1 0.985 0.171 0.6532 1.341 586 34 1 1.014 0.174 0.6777 1.376 604 22 1 1.060 0.185 0.7009 1.446 621 17 1 1.119 0.208 0.7144 1.554 635 16 1 1.181 0.207 0.7795 1.618 640 16 1 1.244 0.226 0.8033 1.723 646 13 1 1.320 0.229 0.8775 1.809 653 9 1 1.432 0.252 0.9394 1.983 653 9 1 1.543 0.312 0.9201 2.238 > > # Note that I have the same estimates but different SE's than SAS. We are using > # a different estimator. It's a statistical argument as to which is > # better (one could defend both sides): SAS the more standard estimate found > # in the reliability literature, mine the estimate from statistics literature > rm(temp, kfit, xx) > > ###################################################### > # Turbine data, lognormal fit > turbine <- read.table('data.turbine', + col.names=c("time1", "time2", "n")) > > tfit <- survreg(Surv(time1, time2, type='interval2') ~1, turbine, + dist='lognormal', weights=n, subset=(n>0)) > > summary(tfit) Call: survreg(formula = Surv(time1, time2, type = "interval2") ~ 1, data = turbine, weights = n, subset = (n > 0), dist = "lognormal") Value Std. Error z p (Intercept) 3.6999 0.0708 52.23 <2e-16 Log(scale) -0.3287 0.1232 -2.67 0.0076 Scale= 0.72 Log Normal distribution Loglik(model)= -190.7 Loglik(intercept only)= -190.7 Number of Newton-Raphson Iterations: 6 n= 21 > > # Now, do his plot, but put bootstrap confidence bands on it! > # First, make a simple data set without weights > tdata <- turbine[rep(1:nrow(turbine), turbine$n),] > > qstat <- function(data) { + temp <- survreg(Surv(time1, time2, type='interval2') ~1, data=data, + dist='lognormal') + qsurvreg(plist, temp$coef, temp$scale, dist='lognormal') + } > > {if (exists('bootstrap')) { + set.seed(1953) # a good year :-) + bfit <- bootstrap(tdata, qstat, B=1000) + bci <- limits.bca(bfit, probs=c(.025, .975)) + } + else { + values <- matrix(0, nrow=1000, ncol=length(plist)) + n <- nrow(tdata) + for (i in 1:1000) { + subset <- sample(1:n, n, replace=T) + values[i,] <- qstat(tdata[subset,]) + } + bci <- t(apply(values,2, quantile, c(.05, .95))) + } + } > xmat <- cbind(qsurvreg(plist, tfit$coef, tfit$scale, dist='lognormal'), + bci) > > > matplot(xmat, qnorm(plist), + type='l', lty=c(1,2,2), col=c(1,1,1), + log='x', yaxt='n', ylab='Percent', + xlab='Time of Cracking (Hours x 100)') > axis(2, qnorm(plist), format(100*plist), adj=1) > title("Turbine Data") > kfit <- survfit(Surv(time1, time2, type='interval2') ~1, data=tdata) > points(kfit$time, qnorm(1-kfit$surv), pch='+') > > dev.off() #close the plot file null device 1 > > > proc.time() user system elapsed 3.70 0.39 4.10