R version 4.4.0 beta (2024-04-15 r86425 ucrt) -- "Puppy Cup" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > # > # Check out the survfit routine on the simple AML data set. > # The leverage validation makes use of the fact that when all > # weights are 1 and there is 1 obs per subject, the IJ variance is > # equal to the Greenwood. > # There are 8 choices in the C code: Nelson-Aalen or Fleming-Harrington > # estimate of cumulative hazard, KM or exp(cumhaz) estimate of survival, > # regular or robust variance. This tries to exercise them all. > > library(survival) > aeq <- function(x, y, ...) all.equal(as.vector(x), as.vector(y), ...) > > set.seed(1953) # used only to reorder the data > adata <- aml > adata$id <- sample(LETTERS, nrow(aml)) # labels are not in time or data order > adata <- adata[sample(1:nrow(aml), nrow(aml)),] # data is unordered > adata$wt <- sample((2:30)/10, nrow(aml)) # non-integer weights > > group <- rep("", nrow(adata)) > temp <- table(adata$x) > group[adata$x == "Maintained"] <- rep(letters[4:1], length=temp[1]) > group[adata$x != "Maintained"] <- rep(letters[4:7], length=temp[2]) > adata$group <- group > > adata2 <- survSplit(Surv(time, status) ~ ., adata, cut=c(10, 20, 40)) > > byhand <- function(time, status, weights, id) { + # for a single curve + utime <- sort(unique(time)) + ntime <- length(utime) + n <- length(time) + if (missing(weights)) weights <- rep(1.0, n) + if (missing(id)) id <- seq(along=time) + + uid <- unique(id) + nid <- length(uid) + id <- match(id, uid) # change it to 1:nid + + n.risk <- n.event <- surv <- cumhaz <- double(ntime) + KM <- 1; nelson <-0; + kvar <- 0; hvar<-0; + + U <- matrix(0, nid, 2) # the two robust influence estimates + V <- matrix(0, ntime, 4) # variances + usave <- array(0., dim=c(nid, 2, ntime)) + estimate <- matrix(0, ntime, 2) + + for (i in 1:ntime) { + atrisk <- (time >= utime[i]) + n.risk[i] <- sum(weights[atrisk]) + deaths <- (time==utime[i] & status==1) + n.event[i] <- sum(weights[deaths]) + + haz <- n.event[i]/n.risk[i] + dhaz <- (ifelse(deaths,1,0) - ifelse(atrisk, haz, 0))/n.risk[i] + U[,1] <- U[,1]*(1-haz) - KM*tapply(dhaz*weights, id, sum) + V[i,1] <- sum(U[,1]^2) + + U[,2] <- U[,2] + tapply(dhaz* weights, id, sum) #result in 'id' order + V[i,2] <- sum(U[,2]^2) + usave[,,i] <- U + + if (n.event[i] >0 ) { + KM <- KM*(1-haz) + nelson <- nelson + haz + kvar <- kvar + n.event[i]/(n.risk[i] * (n.risk[i] - n.event[i])) + hvar <- hvar + n.event[i]/(n.risk[i]^2) + } + + V[i,3] <- kvar # var of log(S) + V[i,4] <- hvar + estimate[i,] <- c(KM, nelson) + } + dimnames(usave) <- list(uid, c("KM", "chaz"), utime) + dimnames(V) <- list(time=utime, c("KM", "chaz", "Greenwood", "Aalen")) + list(time=utime, n.risk=n.risk, n.event=n.event, estimate=estimate, + std = sqrt(V), influence=usave) + } > > # the byhand function can only handle one group at a time > true1a <- with(subset(adata, x=="Maintained"), byhand(time, status, id=id)) > true1b <- with(subset(adata, x!="Maintained"), byhand(time, status, id=id)) > > # The Greenwood and IJ estimates agree, except for a last point with > # variance of zero. These next few lines verify the byhand() function > aeq(true1a$std[,1], true1a$estimate[,1]*true1a$std[,3]) [1] TRUE > aeq(true1b$std[1:9,1], true1b$estimate[1:9,1]*true1b$std[1:9,3]) [1] TRUE > aeq(true1b$std[10,1], 0) # variance of zero for jackknife [1] TRUE > !is.finite(true1b$std[10,3]) # Inf for Greenwood [1] TRUE > temp <- with(subset(adata, x=="Maintained"), byhand(time, status, id=id, + weights=rep(3,11))) > aeq(temp$std[,1:2], true1a$std[,1:2]) # IJ estimates should be invariant [1] TRUE > > # fit1 uses the standard formulas: NA hazard, KM survival > fit1 <- survfit(Surv(time, status) ~ x, data=adata) > aeq(fit1$surv, c(true1a$estimate[,1], true1b$estimate[,1])) [1] TRUE > aeq(fit1$cumhaz, c(true1a$estimate[,2], true1b$estimate[,2])) [1] TRUE > aeq(fit1$std.err, c(true1a$std[,3], true1b$std[,3])) [1] TRUE > aeq(fit1$std.chaz, c(true1a$std[,4], true1b$std[,4])) [1] TRUE > aeq(fit1$n.risk, c(true1a$n.risk, true1b$n.risk)) [1] TRUE > aeq(fit1$n.event, c(true1a$n.event, true1b$n.event)) [1] TRUE > fit1$logse # logse should be TRUE [1] TRUE > fit1b <- survfit(Surv(tstart, time, status) ~x, data=adata2, id=id) > eqsurv <- function(x, y) { + temp <- c("n.risk", "n.event", "n.censor", "surv", "std.err", "cumhaz", + "std.chaz", "strata", "logse") + if (!is.null(x$influence.surv)) temp <- c(temp, "influence.surv") + if (!is.null(x$influence.chaz)) temp <- c(temp, "influence.chaz") + # need unclass to avoid [.survfit + all.equal(unclass(x)[temp], unclass(y)[temp]) + } > eqsurv(fit1, fit1b) [1] TRUE > fit1c <- survfit(Surv(tstart, time, status) ~x, data=adata2, id=id, entry=TRUE) > aeq(fit1c$time[fit1c$time >0], fit1$time) [1] TRUE > aeq(fit1c$n.enter[fit1c$time==0], c(11, 12)) [1] TRUE > all(fit1c$n.enter[fit1c$time >0] ==0) [1] TRUE > > # fit2 will use the IJ method > fit2 <- survfit(Surv(time, status) ~ x, data=adata, id=id, influence=1) > aeq(fit2$surv, c(true1a$estimate[,1], true1b$estimate[,1])) [1] TRUE > aeq(fit2$cumhaz, c(true1a$estimate[,2], true1b$estimate[,2])) [1] TRUE > aeq(fit2$std.err, c(true1a$std[,1], true1b$std[,1])) [1] TRUE > aeq(fit2$std.chaz, c(true1a$std[,2], true1b$std[,2])) [1] TRUE > aeq(fit2$n.risk, c(true1a$n.risk, true1b$n.risk)) [1] TRUE > aeq(fit2$n.event, c(true1a$n.event, true1b$n.event)) [1] TRUE > !fit2$logse # logse should be FALSE [1] TRUE > fit2b <- survfit(Surv(tstart, time, status) ~ x, data=adata2, id=id, + influence=1) > eqsurv(fit2, fit2b) [1] TRUE > fit2c <- survfit(Surv(tstart, time, status) ~ 1, data=adata2, id=id, + subset=(x=="Maintained"), influence=1) > aeq(fit2$influence.surv[[1]], fit2c$influence.surv) [1] TRUE > r2 <- resid(fit2c, times= fit2c$time, collapse=TRUE) > aeq(r2, fit2c$influence.surv) [1] TRUE > > fit2d <- survfit(Surv(time, factor(status)) ~ x, data=adata, id=id, influence=T) > aeq(fit2d$influence[[1]][,,1], r2) [1] TRUE > r3 <- resid(fit2d, times= fit2c$time, collapse=TRUE) > aeq(r3[adata$x =="Maintained",1,], r2) [1] TRUE > > fit2e <- survfit(Surv(time, factor(status)) ~1, adata, id=id, influence=T, + subset=(x=="Maintained")) > aeq(fit2e$influence, fit2d$influence[[1]]) [1] TRUE > aeq(fit2e$influence[,,1], r2) [1] TRUE > > > # look at the leverage values > fit3 <- survfit(Surv(time, status) ~ x, data=adata, id=id, influence=3) > aeq(fit3$influence.surv[[1]], true1a$influence[,1,]) [1] TRUE > aeq(fit3$influence.surv[[2]], true1b$influence[,1,]) [1] TRUE > aeq(fit3$influence.chaz[[1]], true1a$influence[,2,]) [1] TRUE > aeq(fit3$influence.chaz[[2]], true1b$influence[,2,]) [1] TRUE > fit3b <- survfit(Surv(tstart, time, status) ~x, adata2, id=id, influence=3) > eqsurv(fit3, fit3b) [1] TRUE > > # compute the influence by brute force > tdata <- subset(adata, x != "Maintained") > eps <- 1e-8 > imat1 <- imat2 <- matrix(0., 12, 10) > t1 <- survfit(Surv(time, status) ~x, data=tdata) > for (i in 1:12) { + wtemp <- rep(1.0, 12) + wtemp[i] <- 1 + eps + tfit <-survfit(Surv(time, status) ~x, data=tdata, weights=wtemp) + imat2[i,] <- (tfit$cumhaz - t1$cumhaz)/eps + imat1[i,] <- (tfit$surv - t1$surv)/eps + } > aeq(imat1, true1b$influence[,1,], tol= sqrt(eps)) [1] TRUE > aeq(imat2, true1b$influence[,2,], tol= sqrt(eps)) [1] TRUE > > # Repeat using the Nelson-Aalen hazard and exp(NA) for survival > fit1 <- survfit(Surv(time, status) ~ x, adata, stype=2) > aeq(fit1$surv, exp(-c(true1a$estimate[,2], true1b$estimate[,2]))) [1] TRUE > aeq(fit1$cumhaz, c(true1a$estimate[,2], true1b$estimate[,2])) [1] TRUE > aeq(fit1$std.err, c(true1a$std[,4], true1b$std[,4])) [1] TRUE > aeq(fit1$std.chaz, c(true1a$std[,4], true1b$std[,4])) [1] TRUE > aeq(fit1$n.risk, c(true1a$n.risk, true1b$n.risk)) [1] TRUE > fit1b <- survfit(Surv(tstart, time, status) ~x, adata2, stype=2, id=id) > eqsurv(fit1, fit1b) [1] TRUE > > # Nelson-Aalen + exp() surv, along with IJ variance > fit2 <- survfit(Surv(time, status) ~ x, data=adata, id=id, stype=2, + influence=3) > aeq(fit2$surv, exp(-c(true1a$estimate[,2], true1b$estimate[,2]))) [1] TRUE > aeq(fit2$cumhaz, c(true1a$estimate[,2], true1b$estimate[,2])) [1] TRUE > aeq(fit2$std.err, c(true1a$std[,2], true1b$std[,2])) [1] TRUE > aeq(fit2$std.chaz, c(true1a$std[,2], true1b$std[,2])) [1] TRUE > aeq(fit2$n.risk, c(true1a$n.risk, true1b$n.risk)) [1] TRUE > aeq(fit2$influence.chaz[[1]], true1a$influence[,2,]) [1] TRUE > aeq(fit2$influence.chaz[[2]], true1b$influence[,2,]) [1] TRUE > aeq(fit2$influence.surv[[2]], -true1b$influence[,2,]%*% diag(fit2[2]$surv)) [1] TRUE > fit2b <- survfit(Surv(tstart, time, status) ~x, data=adata2, id=id, stype=2, + influence=3) > eqsurv(fit2, fit2b) [1] TRUE > # Cumulative hazard is the same for fit1 and fit2 > all.equal(fit2$influence.chaz, fit2b$influence.chaz) [1] TRUE > > # Weighted fits > true2a <- with(subset(adata, x=="Maintained"), byhand(time, status, id=id, + weights= wt)) > true2b <- with(subset(adata, x!="Maintained"), byhand(time, status, id=id, + weights=wt)) > fit3 <- survfit(Surv(time, status) ~ x, data=adata, id=id, weights=wt, + influence=TRUE) > aeq(fit3$influence.surv[[1]], true2a$influence[,1,]) [1] TRUE > aeq(fit3$influence.surv[[2]], true2b$influence[,1,]) [1] TRUE > aeq(fit3$influence.chaz[[1]], true2a$influence[,2,]) [1] TRUE > aeq(fit3$influence.chaz[[2]], true2b$influence[,2,]) [1] TRUE > aeq(fit3$surv, c(true2a$estimate[,1], true2b$estimate[,1])) [1] TRUE > aeq(fit3$cumhaz, c(true2a$estimate[,2], true2b$estimate[,2])) [1] TRUE > aeq(fit3$std.err, c(true2a$std[,1], true2b$std[,1])) [1] TRUE > aeq(fit3$std.chaz, c(true2a$std[,2], true2b$std[,2])) [1] TRUE > aeq(fit3$n.risk, c(true2a$n.risk, true2b$n.risk)) [1] TRUE > aeq(fit3$n.event, c(true2a$n.event, true2b$n.event)) [1] TRUE > fit3b <- survfit(Surv(tstart, time, status) ~x, adata2, id=id, weights=wt, + influence=TRUE) > eqsurv(fit3, fit3b) [1] TRUE > > # Different survival, same hazard > fit3b <- survfit(Surv(time, status) ~ x, data=adata, id=id, weights=wt, + influence=2, stype=2) > temp <- c("n", "time", "cumhaz", "std.chaz", "influence.chaz", "n.risk", + "n.event") > aeq(unclass(fit3b)[temp], unclass(fit3)[temp]) # unclass avoids [.survfit [1] TRUE > aeq(fit3b$surv, exp(-c(true2a$estimate[,2], true2b$estimate[,2]))) [1] TRUE > aeq(fit3b$std.err, fit3b$std.chaz) [1] TRUE > aeq(fit3b$logse, FALSE) [1] TRUE > aeq(fit3b$n.risk, c(true2a$n.risk, true2b$n.risk)) [1] TRUE > aeq(fit3b$n.event, c(true2a$n.event, true2b$n.event)) [1] TRUE > > # The grouped jackknife > fit4 <- survfit(Surv(time, status) ~ x, data=adata, id=id, weights=wt, + influence=TRUE, cluster=group) > g1 <- adata$group[match(rownames(true2a$influence[,1,]), adata$id)] > g2 <- adata$group[match(rownames(true2b$influence[,1,]), adata$id)] > aeq(fit4$influence.surv[[1]], rowsum(true2a$influence[,1,], g1, reorder=FALSE)) [1] TRUE > aeq(fit4$influence.surv[[2]], rowsum(true2b$influence[,1,], g2, reorder=FALSE)) [1] TRUE > aeq(fit4$influence.chaz[[1]], rowsum(true2a$influence[,2,], g1, reorder=FALSE)) [1] TRUE > aeq(fit4$influence.chaz[[2]], rowsum(true2b$influence[,2,], g2, reorder=FALSE)) [1] TRUE > > aeq(c(colSums(fit4$influence.surv[[1]]^2), colSums(fit4$influence.surv[[2]]^2)), + fit4$std.err^2) [1] TRUE > aeq(c(colSums(fit4$influence.chaz[[1]]^2), colSums(fit4$influence.chaz[[2]]^2)), + fit4$std.chaz^2) [1] TRUE > > # The Fleming-Harrington is a more complex formula. Start with weights of > # 1. > fit5 <- survfit(Surv(time, status) ~x, adata, ctype=2) > nrisk <- c(11,10,8,7, 5,4,2, 12, 11, 10, 9, 8, 6:1) > chaz <- c(cumsum(1/nrisk[1:7])[c(1:4,4, 5,6,6,7,7)], + cumsum(1/nrisk[8:18])[c(2,4,5,5,6:11)]) > aeq(fit5$cumhaz, chaz) [1] TRUE > aeq(fit5$std.chaz, sqrt(c(cumsum(1/nrisk[1:7]^2)[c(1:4,4, 5,6,6,7,7)], + cumsum(1/nrisk[8:18]^2)[c(2,4,5,5,6:11)]))) [1] TRUE > > # We can compute the FH using a fake data set where each tie is spread out > # over a set of fake times. > # > fh <- function(time, status, weights, id) { + counts <- table(time, status) + utime <- sort(unique(time)) + tied <- counts[,2] > 1 + + if (missing(weights)) weights <- rep(1.0, length(time)) + if (missing(id)) id <- 1:length(time) + + # build the expanded data set + delta <- min(diff(utime))/(2*max(counts[,2])) + efun <- function(x) { + who <- which(time==x & status==1) + ntie <- length(who) + data.frame(time = rep(x - (1:ntie -1)*delta, each=ntie), + id = rep(id[who], ntie), + status = rep(1, ntie^2), + weight = rep(weights[who]/ntie, ntie), + stringsAsFactors=FALSE + ) + } + + temp <- do.call(rbind, lapply(utime[tied], efun)) + notie <- (status==0 | !(time %in% utime[tied])) + + bfit <- byhand(time = c(time[notie], temp$time), + status = c(status[notie], temp$status), + id = c(id[notie], temp$id), + weights = c(weights[notie], temp$weight) + ) + keep <- match(utime, bfit$time) # the real time points + + # The influence from survfit is in data order, which we have perturbed. + # Fix that + indx <- match(unique(id), dimnames(bfit$influence)[[1]]) + + list(time=bfit$time[keep], + n.risk=bfit$n.risk[keep - pmax(0, counts[,2]-1)], + n.event = bfit$n.event[keep]* counts[,2], + estimate=bfit$estimate[keep,], + std = bfit$std[keep,], influence=bfit$influence[indx,,keep]) + } > > # Case weights > true6a <- with(subset(adata, x=="Maintained"), fh(time, status, wt, id)) > true6b <- with(subset(adata, x!="Maintained"), fh(time, status, wt, id)) > > fit6 <- survfit(Surv(time, status) ~ x, weight=wt, data=adata, stype=2, + ctype=2, robust=FALSE) > aeq(fit6$cumhaz, c(true6a$estimate[,2], true6b$estimate[,2])) [1] TRUE > aeq(fit6$surv, exp(-c(true6a$estimate[,2], true6b$estimate[,2]))) [1] TRUE > aeq(fit6$std.chaz, c(true6a$std[,4], true6b$std[,4])) [1] TRUE > aeq(fit6$n.risk, c(true6a$n.risk, true6b$n.risk)) [1] TRUE > aeq(fit6$n.event, c(true6a$n.event, true6b$n.event)) [1] TRUE > > # Robust variance > fit7 <- survfit(Surv(time, status) ~ x, weight=wt, data=adata, stype=2,ctype=2, + id=id, influence=2, robust=TRUE) > aeq(fit7$cumhaz, c(true6a$estimate[,2], true6b$estimate[,2])) [1] TRUE > aeq(fit7$surv, exp(-c(true6a$estimate[,2], true6b$estimate[,2]))) [1] TRUE > aeq(fit7$std.chaz, c(true6a$std[,2], true6b$std[,2])) [1] TRUE > aeq(fit7$n.risk, c(true6a$n.risk, true6b$n.risk)) [1] TRUE > aeq(fit7$n.event, c(true6a$n.event, true6b$n.event)) [1] TRUE > aeq(fit7$influence.chaz[[1]], true6a$influence[,2,]) [1] TRUE > aeq(fit7$influence.chaz[[2]], true6b$influence[,2,]) [1] TRUE > > > # compute the influence by brute force > tdata <- subset(adata, x != "Maintained") > eps <- 1e-8 > imat <- matrix(0., 12, 10) > t1 <- survfit(Surv(time, status) ~x, data=tdata, ctype=2, weights=wt) > for (i in 1:12) { + wtemp <- tdata$wt + wtemp[i] <- wtemp[i] + eps + tfit <-survfit(Surv(time, status) ~x, data=tdata, ctype=2, + weights=wtemp) + imat[i,] <- tdata$wt[i] * (tfit$cumhaz - t1$cumhaz)/eps + } > aeq(fit7$influence.chaz[[2]], imat, tol=sqrt(eps)) [1] TRUE > > # > # verify that the times and scale arguments work as expected. They > # are in the summary and print.survfit functions. > # > s1 <- summary(fit1, scale=1) > s2 <- summary(fit1, scale=2) > aeq(s1$time/2, s2$time) #times change [1] TRUE > aeq(s1$surv, s2$surv) [1] TRUE > tscale <- rep(c(1,1,1,1, 2,2,2,2,2), each=2) > aeq(s1$table, s2$table *tscale) [1] TRUE > > s3 <- summary(fit1, scale=1, times=c(9, 18, 23, 33, 34)) > s4 <- summary(fit1, scale=2, times=c(9, 18, 23, 33, 34)) > aeq(s3$time, s4$time*2) [1] TRUE > aeq(s3$surv, s4$surv) [1] TRUE > > print(fit1, rmean='common') Call: survfit(formula = Surv(time, status) ~ x, data = adata, stype = 2) n events rmean* se(rmean) median 0.95LCL 0.95UCL x=Maintained 11 7 60.3 25.60 34 23 NA x=Nonmaintained 12 11 30.1 9.14 27 8 NA * restricted mean with upper limit = 161 > print(fit1, rmean='common', scale=2) Call: survfit(formula = Surv(time, status) ~ x, data = adata, stype = 2) n events rmean* se(rmean) median 0.95LCL 0.95UCL x=Maintained 11 7 30.2 12.80 17.0 11.5 NA x=Nonmaintained 12 11 15.1 4.57 13.5 4.0 NA * restricted mean with upper limit = 80.5 > > proc.time() user system elapsed 1.00 0.12 1.11