R version 4.4.0 RC (2024-04-16 r86458 ucrt) -- "Puppy Cup" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > options(na.action=na.exclude) # preserve missings > options(contrasts=c('contr.treatment', 'contr.poly')) #ensure constrast type > library(survival) > > # > # The test data set from Turnbull, JASA 1974, 169-73. > # > # status 0=right censored > # 1=exact > # 2=left censored > # > aeq <- function(x,y, ...) all.equal(as.vector(x), as.vector(y), ...) > > > turnbull <- data.frame( time =c( 1,1,1, 2,2,2, 3,3,3, 4,4,4), + status=c( 1,0,2, 1,0,2, 1,0,2, 1,0,2), + n =c(12,3,2, 6,2,4, 2,0,2, 3,3,5)) > # > # Compute the K-M for the Turnbull data > # via a slow EM calculation > # > > emsurv <- function(time, status, wt, verbose=T) { + left.cen <- (status==2) + if (!any(left.cen)) stop("No left censored data!") + if (!any(status==1))stop("Must have some exact death times") + + tempy <- Surv(time[!left.cen], status[!left.cen]) + ww <- wt[!left.cen] + tempx <- factor(rep(1, sum(!left.cen))) + tfit <- survfit(tempy~tempx, weight=ww) + if (verbose) + cat("Iteration 0, survival=", format(round(tfit$surv[tfit$n.event>0],3)), + "\n") + + stimes <- tfit$time[tfit$n.event>0] + ltime <- time[left.cen] + lwt <- wt[left.cen] + tempx <- factor(rep(1, length(stimes) + sum(!left.cen))) + tempy <- Surv(c(time[!left.cen], stimes), + c(status[!left.cen], rep(1, length(stimes)))) + for (iter in 1:4) { + wt2 <- stimes*0 + ssurv <- tfit$surv[tfit$n.event>0] + sjump <- diff(c(1, ssurv)) + for (j in 1:(length(ltime))) { + k <- sum(ltime[j]>=stimes) #index of the death time + if (k==0) + stop("Left censored observation before the first death") + wt2[1:k] <- wt2[1:k] + lwt[j]*sjump[1:k] /(ssurv[k]-1) + } + tfit <- survfit(tempy~tempx, weight=c(ww, wt2)) + if (verbose) { + cat("Iteration", iter, "survival=", + format(round(tfit$surv[tfit$n.event>0],3)), "\n") + cat(" weights=", format(round(wt2,3)), "\n") + } + } + survfit(tempy ~ tempx, weights=c(ww, wt2), robust=FALSE) + } > > temp <-emsurv(turnbull$time, turnbull$status, turnbull$n) Iteration 0, survival= 0.613 0.383 0.287 0.144 Iteration 1 survival= 0.549 0.303 0.214 0.094 weights= 7.856 3.477 0.828 0.839 Iteration 2 survival= 0.540 0.296 0.210 0.095 weights= 8.228 3.394 0.714 0.664 Iteration 3 survival= 0.538 0.295 0.210 0.095 weights= 8.315 3.356 0.690 0.638 Iteration 4 survival= 0.538 0.295 0.210 0.095 weights= 8.338 3.342 0.685 0.635 > print(summary(temp)) Call: survfit(formula = tempy ~ tempx, weights = c(ww, wt2), robust = FALSE) time n.risk n.event survival std.err lower 95% CI upper 95% CI 1 44.00 20.34 0.5378 0.0752 0.4089 0.707 2 20.66 9.34 0.2946 0.0719 0.1827 0.475 3 9.32 2.68 0.2098 0.0673 0.1119 0.393 4 6.64 3.64 0.0948 0.0507 0.0333 0.270 > # First check, use the data from Turnbull, JASA 1974, 169-173. > > tdata <- data.frame(time =c(1,1,1,2,2,2,3,3,3,4,4,4), + status=rep(c(1,0,2),4), + n =c(12,3,2,6,2,4,2,0,2,3,3,5)) > > tfit <- survfit(Surv(time, time, status, type='interval') ~1, tdata, weight=n) > all.equal(round(tfit$surv,3), c(.538, .295, .210, .095)) [1] TRUE > > > # Second check, compare to a reversed survival curve > # This is not as simple a test as one might think, because left and right > # censored observations are not treated symmetrically by the routine: > # time <= y for left and time> y for right (this is to make the routine > # correct for the common situation of panel data). > # To get equivalence, make the left censoreds happen just a little bit > # earlier. The left-continuous/right-continuous shift is also a bother. > # > test1 <- data.frame(time= c(9, 3,1,1,6,6,8), + status=c(1,NA,1,0,1,1,0), + x= c(0, 2,1,1,1,0,0)) > fit1 <- survfit(Surv(time, status) ~1, test1) > temp <- ifelse(test1$status==0, 4.99,5) - test1$time > fit2 <- survfit(Surv(temp, status, type='left') ~1, test1) > > all.equal(round(fit1$surv[1:2],5), round(1-fit2$surv[3:2],5)) [1] TRUE > > rm(tdata, tfit, fit1, temp, fit2) > # > # Create a data set similar to the one provided by Al Zinsmeister > # It is a hard test case for survfit.turnbull > # > time1 <- c(rep(0,100), rep(1,200), 100, 200, 210, 220, + rep(365,100), rep(366,5), 731:741) > > time2 <- c((1:100)*3, 10+1:100, rep(365:366, c(60,40)), NA, 500, NA, 450, + rep(730,90), rep(NA,10), c(528,571,691,730,731), + NA, 1095:1099, NA, 1400, 1200, 772, 1461) > > zfit <- survfit(Surv(time1, time2, type='interval2') ~1) > > # > # There are 100 intervals of the form (0,x) where x is from 3 to 300, > # and 200 more of the form (1,x) where x is from 11 to 366. These > # lead to a mass point in the interval (1,3), which is placed at 2. > # The starting estimate has far too little mass placed here, and it takes > # the EM a long time to realize that most of the weight for the first 300 > # subjects goes here. With acceleration, it takes 16 iterations, without > # it takes >40. (On Al's orginal data, without accel still wasn't there after > # 165 iters!) > # > # The next 4 obs give rise to potential jumps at 100.5, 200.5, 211.5, and > # 221. However, the final estimate has no mass at all on any of these. > # Assume mass of a,b, and c at 2, 100.5 and 365.5, and consider the > # contributions: > # 123 obs that overlap a only > # 137 obs that overlap a and b > # 40 obs that overlap a, b, c > # 1 obs that overlap b, c > # 108 obs that overlap c (200, 210,200, 365, and 366 starting points) > # For some trial values of a,b,c, compare the loglik to that of (a+b),0,c > # First one: a^123 (a+b)^137 (a+b+c)^40 (b+c) c^108 > # Second: (a+b)^123 (a+b)^137 (a+b+c)^40 c c^108 > # Likelhood improves if (1 + b/a)^123 > 1+ b/c, which is true for almost > # all a and c. In particular, at the solution a and c are approx .7 and > # .18, respectively. > # > # The program can't see this coming, of course, and so iterates towards a > # KM with epsilon sized jumps at 100.5, 200.5, and 211.5. Whether these > # intervals should be removed during iteration, as detected, is an open > # question for me. > # > # > # True solution: mass points at 2, 365.5, 408, and 756.5, of sizes a, b, c, d > # Likelihood: a^260 (a+b)^40 (b+c)^92 (b+c+d)^12 c^5 d^11 > # Solution: a=0.6958, b=0.1674, c=0.1079, d=0.0289 > > tfun <- function(x) { + if (length(x) ==3) x <- c(x, .03) + x <- x/sum(x) #make probabilities sum to 1 + loglik <- 260*log(x[1]) + 40*log(x[1]+x[2]) + 92*log(x[2] + x[3]) + + 12*log(x[2]+x[3]+x[4]) + 5*log(x[3]) + 11*log(x[4]) + -loglik #find the max, not the min + } > > nfit <- nlminb(start=c(.7,.15, .1), tfun, lower=0, upper=1) > nparm <- c(nfit$par, .03) > nparm <- nparm / sum(nparm) > zparm <- -diff(c(1, zfit$surv[match(c(2, 365.5, 408, 756.5), zfit$time)])) > aeq(round(tfun(nparm),4), round(tfun(zparm),4)) [1] TRUE > # .0001 is the tolerance in survfit.turnbull > > rm(tfun, nfit, nparm, zparm, time1, time2, zfit) > > proc.time() user system elapsed 0.85 0.10 0.95