############################################################################################################# ######### This R code is for a single real data application based on subHMM ################################# ############################################################################################################# library('facets') library(stringr) rcmat=readSnpMatrix("TCGA-KL-8331_5_0.pileup.txt.gz") sampling.ratio <- 10 set.seed(2017) # for WGS #xx=preProcSample(rcmat, ndepth = 15, cval = 75, snp.nbhd = 2000) #oo=procSample(xx, cval=500) # for WES xx=preProcSample(rcmat,ndepth = 5) oo=procSample(xx,cval=150) fit=emcncf(oo) inputs <- oo$jointseg[,11:12] lr <- inputs[,1] logor <- inputs[,2] idx_thin <- seq(1, length(lr), sampling.ratio) chrloc_thin <- oo$jointseg[idx_thin, c("chrom","maploc")] lr <- lr[idx_thin] logor <- logor[idx_thin] if(is.na(fit$purity)){ facets.purity = 0.8}else{ facets.purity = fit$purity} if(is.na(fit$ploidy)){ facets.ploidy = 2}else{ facets.ploidy = fit$ploidy} # Get the genotype states genoStates=c("", "A", "AA", "AB","AAA","AAB","AAAA","AABB", "AAAB","AAAAA","AAABB","AAAAB") mainJ <- length(genoStates) tauJ <- mainJ -1 n <- length(lr) logor2 <- logor^2 logor2[which(logor2==0)] <- 0.000001 notMissing <- !is.na(logor2) noMissing <- all(notMissing) logL_ <- function(parm){ p1 <- parm[1] p2 <- parm[2] p3 <- parm[3] p4 <- parm[4] p5 <- parm[5] p6 <- parm[6] mu0_lrs <- mu0_lr(J, p1, p2, p3) mu0_lors <- mu0_lor(J, p2, p3) sigma20_lr <- exp(p4) ep5 <- exp(-p5) ncp <- mu0_lors^2*ep5 dmat <- dchisq(rep(logor2*ep5, each=J), df=1, ncp=rep(ncp, times=n), log=T) dmat <- matrix(dmat, nrow=n, ncol=J, byrow=TRUE) mat <- (-0.5*log(2*pi*sigma20_lr) + lgamma((p6+1)/2) - lgamma(p6/2) + 0.5*p6*log(p6/2) - 0.5*(p6+1)*log(0.5*(p6+(lr-matrix(mu0_lrs, nrow=n, ncol=J, byrow=TRUE))^2/sigma20_lr)) )*pzks1 if (noMissing) { mat <- mat - pzks1*(p5 - dmat) } else { mat[notMissing, ] <- mat[notMissing, , drop=FALSE] - pzks1[notMissing, , drop=FALSE]*(p5 - dmat[notMissing, , drop=FALSE]) } logf <- sum(mat) return(-logf) } logL_reg <- function(parm){ mu0_lrs <- mu0_lr(J,lpsi0, lpr0, parm) mu0_lors <- mu0_lor(J, lpr0, parm) sigma20_lr <- exp(lsigma20_lr) logf <- 0 for (k in sub_reg) { if(is.na(logor2[k])) { logf <- logf+sum((-0.5*log(2*pi*sigma20_lr)+lgamma((v0+1)/2)-lgamma(v0/2) +0.5*v0*log(v0/2)-0.5*(v0+1)*log(0.5*(v0+(lr[k]-mu0_lrs)^2/sigma20_lr)))*pzks1[k,]) } else { logf <- logf+sum(((-0.5*log(2*pi*sigma20_lr)+lgamma((v0+1)/2)-lgamma(v0/2) +0.5*v0*log(v0/2)-0.5*(v0+1)*log(0.5*(v0+(lr[k]-mu0_lrs)^2/sigma20_lr)))*pzks1[k,]- pzks1[k,]*lsigma20_lor+pzks1[k,]*dchisq(logor2[k]*exp(-lsigma20_lor),df=1, ncp=mu0_lors^2*exp(-lsigma20_lor), log=T))) } } return(-logf) } # region-specific Sz0 lsz_est <- function(sub_reg) { m_est_s <- optim(lsz0, logL_reg, method="BFGS", control=list(trace=6)) lsz0 <- m_est_s$par return(lsz0) } # These functions are needed to evaluate logL_ function mu0_lr <- function(nj, lpsi0, lpr0, lsz0) { ret <- rep(NA, nj) pr0 <- 1/(1+exp(lpr0)) sz0 <- 1/(1+exp(lsz0)) jv1 <- 1:mainJ jv2 <- (mainJ+1):nj ret[jv1] <- log((2+pr0*(ctzs[jv1]-2)), base=2)-lpsi0 tmp1 <- ctms[jv2 - mainJ] ret[jv2] <- log(2+(tmp1-2)*pr0+(ctzs[jv2]-tmp1)*sz0*pr0, base=2)-lpsi0 ret } mu0_lor <- function(nj, lpr0, lsz0) { ret <- rep(NA, nj) pr0 <- 1/(1+exp(lpr0)) sz0 <- 1/(1+exp(lsz0)) jv1 <- 1:mainJ jv2 <- (mainJ+1):nj ret[jv1] <- log((mz[jv1]*pr0+(1-pr0)))-log(pz[jv1]*pr0+(1-pr0)) tmpj <- jv2 - mainJ tmp1 <- mz_sub[tmpj] tmp2 <- pz_sub[tmpj] ret[jv2] <- log(1+(mz[jv2]-tmp1)*sz0*pr0+(tmp1-1)*pr0) - log(1+(pz[jv2]-tmp2)*sz0*pr0+(tmp2-1)*pr0) ret } ######################################################################## # Expand the state-space and re-define the space-related notation main_zk <- c("0", "A", "AA", "AB","AAA","AAB","AAAA","AABB","AAAB","AAAAA", "AAABB","AAAAB") mainJ <- length(main_zk) tauJ <- mainJ-1 allzk_main <- c(main_zk, rep(main_zk,each=tauJ)) sub_zk <- c(sapply(1:mainJ, function(x) main_zk[-x])) ctzs <- nchar(allzk_main) # main genotype copy number ctzs[which(allzk_main=="0")] <- 0 ctms <- nchar(sub_zk) # subclone genotype copy number ctms[which(sub_zk=="0")] <- 0 J <- length(allzk_main) mz <- str_count(allzk_main, "A") # A allele frequency corresponding to z mz_sub <- str_count(sub_zk, "A") # A allele of subclone genotype pz <- str_count(allzk_main, "B") # B allele frequency corresponding to z pz_sub <- str_count(sub_zk, "B") # B allele of subclone genotype main_geneidx <- c(1:mainJ,rep(1:mainJ, each=tauJ)) # set initial values lpsi0 <- log(facets.ploidy, base=2) #facets estimates lpr0 <- log((1-facets.purity)/facets.purity) # facets estimates lsz0 <- log(0.2/0.8) lsigma20_lr <- log(0.5) sigma20_lr <- exp(lsigma20_lr) lsigma20_lor <- log(0.5) v0 <- 3 # estimate the initial probability of w: P(w_1=w) r0 <- rep(1/mainJ, mainJ) u0 <- rep(1/2,2) hatp0_M <- matrix(c(rep(c(1-tauJ/5000,rep(1/5000,J/mainJ)),tauJ),1-tauJ/5000),J/mainJ,J/mainJ) hatp0_m <- c(0.999, 0.001, 0.001,0.999) # initial probability of Tk that follows a markov chain pt <- diag(tauJ) # The transition matrix of Tk is given as the identity matrix so this is NOT a parameter to estimate hatp0_t0 <- c(rep(1/tauJ,tauJ), rep(1/tauJ,tauJ*tauJ)) ### calculate the transition matrix of all Z,U,T ################################################################ hatp01 <- NULL for(h in 1:mainJ){ hatp_0s <- numeric(mainJ*tauJ) for(hh in 1:mainJ){ hatp_0s[((hh-1)*tauJ+1):(hh*tauJ)] <- hatp0_M[h,hh]*hatp0_m[2]*hatp0_t0[((hh-1)*tauJ+1):(hh*tauJ)] } hatp01 <- rbind(hatp01, hatp_0s) } rownames(hatp01) <- main_zk hatp11 <- NULL for(h in 1:mainJ){ hatp_11s <- matrix(0,tauJ,mainJ*tauJ) hatp_11s[,((h-1)*tauJ+1):(h*tauJ)] <- hatp0_M[h,h]*hatp0_m[4]*pt for(hh in c(1:mainJ)[-h]){ hatp_11s[,((hh-1)*tauJ+1):(hh*tauJ)] <- hatp0_M[h,hh]*hatp0_m[4]*matrix(c(hatp0_t0[((hh-1)*tauJ+1):(hh*tauJ)]), byrow=T, nrow=tauJ, ncol=tauJ) } hatp11 <- rbind(hatp11, hatp_11s) } hatp0 <- matrix(0,J,J) for(i in 1:mainJ){ hatp0[i,] <- c(hatp0_M[i,]*hatp0_m[1], hatp01[i,] ) } hatp10 <- NULL for(i in 1:mainJ){ temp <- matrix(c(hatp0_M[i,]*hatp0_m[3]), nrow=1) temp1 <- matrix(rep(temp,tauJ), byrow=T, ncol=mainJ) hatp10 <- rbind(hatp10, temp1) } hatp0[(mainJ+1):J,1:mainJ] <- hatp10 hatp0[(mainJ+1):J,(mainJ+1):J] <- hatp11 r0s <- c(r0*u0[1], c(sapply(1:mainJ, function(j) sapply(1:tauJ, function(t) r0[j]*u0[2]*hatp0_t0[(j-1)*tauJ+t])))) M <- 200 logL_Y <- numeric(M) sigma20_lr <- exp(lsigma20_lr) sigma20_lor <- exp(lsigma20_lor) mu0_lrs <- mu0_lr(J,lpsi0, lpr0, lsz0) mu0_lors <- mu0_lor(J, lpr0, lsz0) a1d <- if(is.na(logor2[1])) {sapply(1:J,function(i) (0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[1]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*r0s[i]) } else { sapply(1:J,function(i) (0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[1]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*exp(-lsigma20_lor)*dchisq(logor2[1]*exp(-lsigma20_lor),df=1, ncp=(mu0_lors[i])^2*exp(-lsigma20_lor))*r0s[i]) } c1 <- 1/sum(a1d) a1h <- c1*a1d akh_all <- matrix(NA, n,J) akh_all[1,] <- a1h ca_all <- numeric(n) ca_all[1] <- c1 for (k in 2:n){ akd <- if(is.na(logor2[k])) {(matrix(a1h, nrow=1)%*%hatp0)*((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))} else { (matrix(a1h, nrow=1)%*%hatp0)*(((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))* (exp(-lsigma20_lor)*dchisq(logor2[k]*exp(-lsigma20_lor),df=1, ncp=mu0_lors^2*exp(-lsigma20_lor))))} cka <- 1/sum(akd) ca_all[k] <- cka akh <- cka*akd akh_all[k,] <- akh a1h <- akh } logL_Y[1] <- sum(sum(-log(ca_all[1:n]))+log(sum(akh_all[n,]))) watch <- Sys.time() for (mm in 1:M){ # Scaling HMM with forward-backward alg. in E-step bnd <- rep(1,J) cn <- 1/sum(bnd) bnh <- cn*bnd cb_all <- numeric(n) cb_all[n] <- cn # calcluate all b_l(k) : backward bkh_all <- matrix(NA,n,J) bkh_all[n,] <- bnh for (k in (n-1):1) { bkd <- if(is.na(logor2[k+1])) {hatp0%*%matrix((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k+1]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2))*bnh, ncol=1)} else {hatp0%*%matrix(((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k+1]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))* (exp(-lsigma20_lor)*dchisq(logor2[k+1]*exp(-lsigma20_lor),df=1, ncp=mu0_lors^2*exp(-lsigma20_lor)))*bnh,ncol=1)} ckb <- 1/sum(bkd) cb_all[k] <- ckb bkh <- ckb*bkd bkh_all[k,] <- bkh bnh <- bkh } pzks <- t(sapply(1:n, function(i) { pzki <- akh_all[i,]*bkh_all[i,]/sum(akh_all[n,]) return(pzki)} )) lca_all <- log(ca_all) lcb_all <- log(cb_all) pzks1 <- t(cbind(sapply(1:(n-1), function(i) exp(sum(lca_all[(i+1):n])-sum(lcb_all[i:n]))*pzks[i,]), pzks[n,]/cb_all[n])) pzzks <- array(NA, c(J,J,n-1)) for( k in 2:(n-1)){ pzzks[,,k-1] <- if (is.na(logor2[k])) {exp(sum(log(ca_all[(k+1):n]))-sum(log(cb_all[k:n])))*ca_all[k]*(hatp0* (akh_all[k-1,]%*%matrix(bkh_all[k,]*((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2))) , nrow=1)))/sum(akh_all[n,])} else {exp(sum(log(ca_all[(k+1):n]))-sum(log(cb_all[k:n])))*ca_all[k]*( hatp0*(akh_all[k-1,]%*%matrix(bkh_all[k,]*((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2))* exp(-lsigma20_lor)*dchisq(logor2[k]*exp(-lsigma20_lor),df=1, ncp=mu0_lors^2*exp(-lsigma20_lor))), nrow=1)))/sum(akh_all[n,])} } pzzks[,,(n-1)] <- if (is.na(logor2[n])) {sapply(1:J, function(i) sapply(1:J, function(j) exp(sum(log(ca_all[n:n])))*hatp0[j,i]*akh_all[n-1,j]*(0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[n]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))/sum(akh_all[n,])))} else {sapply(1:J, function(i) sapply(1:J, function(j) exp(sum(log(ca_all[n:n])))*hatp0[j,i]*akh_all[n-1,j]*(0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[n]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*exp(-lsigma20_lor)*dchisq(logor2[n]*exp(-lsigma20_lor),df=1, ncp=mu0_lors[i]^2*exp(-lsigma20_lor))/sum(akh_all[n,])))} ######### M-step ####################################### hatp0_M <- sapply(1:mainJ, function(g) sapply(1:mainJ, function(l) (sum(pzzks[l,g,])+sum(pzzks[l,(1+mainJ+(g-1)*tauJ):(1+mainJ+g*tauJ-1),])+sum(pzzks[(1+mainJ+(l-1)*tauJ):(1+mainJ+l*tauJ-1),g,])+sum(pzzks[(1+mainJ+(l-1)*tauJ):(1+mainJ+l*tauJ-1),(1+mainJ+(g-1)*tauJ):(1+mainJ+g*tauJ-1),]))/(sum(pzzks[l,,])+sum(pzzks[(1+mainJ+(l-1)*tauJ):(1+mainJ+l*tauJ-1),,])))) hatp0_m01 <- sum(pzzks[1:mainJ,(mainJ+1):J,])/sum(pzzks[1:mainJ,,]) hatp0_m10 <- sum(pzzks[(mainJ+1):J, 1:mainJ,])/sum(pzzks[(mainJ+1):J,,]) hatp0_m <-c(1-hatp0_m01, hatp0_m01, hatp0_m10, 1-hatp0_m10) hatp0_t0 <- c(sapply(1:mainJ, function(g) sapply(1:tauJ, function(t) (pzks1[1,(mainJ+(g-1)*tauJ+t)]+ sum(pzzks[1:mainJ, (mainJ+(g-1)*tauJ+t),])+sum(pzzks[((mainJ+1):J)[-(((g-1)*tauJ+1):(g*tauJ))],(mainJ+(g-1)*tauJ+t), ]))/ (sum(pzks1[1,(mainJ+(g-1)*tauJ+1):(mainJ+g*tauJ)])+sum(pzzks[1:mainJ, (mainJ+(g-1)*tauJ+1):(mainJ+g*tauJ),])+ sum(pzzks[((mainJ+1):J)[-(((g-1)*tauJ+1):(g*tauJ))],(mainJ+(g-1)*tauJ+1):(mainJ+g*tauJ), ]))))) r0 <- sapply(1:mainJ, function(j) sum(pzks1[1,c(j,(mainJ+1+tauJ*(j-1)):(mainJ+tauJ*j))])/sum(pzks1[1,])) u0 <- c(sum(pzks1[1,1:mainJ]), sum(pzks1[1,(mainJ+1):J]))/sum(pzks1[1,]) r0s <- c(r0*u0[1], c(sapply(1:mainJ, function(j) sapply(1:tauJ, function(t) r0[j]*u0[2]*hatp0_t0[(j-1)*tauJ+t])))) hatp01 <- NULL for(h in 1:mainJ){ hatp_0s <- numeric(mainJ*tauJ) for(hh in 1:mainJ){ hatp_0s[((hh-1)*tauJ+1):(hh*tauJ)] <- hatp0_M[h,hh]*hatp0_m[2]*hatp0_t0[((hh-1)*tauJ+1):(hh*tauJ)] } hatp01 <- rbind(hatp01, hatp_0s) } rownames(hatp01) <- main_zk hatp11 <- NULL for(h in 1:mainJ){ hatp_11s <- matrix(0,tauJ,mainJ*tauJ) hatp_11s[,((h-1)*tauJ+1):(h*tauJ)] <- hatp0_M[h,h]*hatp0_m[4]*pt for(hh in c(1:mainJ)[-h]){ hatp_11s[,((hh-1)*tauJ+1):(hh*tauJ)] <- hatp0_M[h,hh]*hatp0_m[4]*matrix(c(hatp0_t0[((hh-1)*tauJ+1):(hh*tauJ)]), byrow=T, nrow=tauJ, ncol=tauJ) } hatp11 <- rbind(hatp11, hatp_11s) } hatp0 <- matrix(0,J,J) for(i in 1:mainJ){ hatp0[i,] <- c(hatp0_M[i,]*hatp0_m[1], hatp01[i,] ) } hatp10 <- NULL for(i in 1:mainJ){ temp <- matrix(c(hatp0_M[i,]*hatp0_m[3]), nrow=1) temp1 <- matrix(rep(temp,tauJ), byrow=T, ncol=mainJ) hatp10 <- rbind(hatp10, temp1) } hatp0[(mainJ+1):J,1:mainJ] <- hatp10 hatp0[(mainJ+1):J,(mainJ+1):J] <- hatp11 # Pr and psi cannot be evaluated as a closed form # using optim m_est <- optim(c(lpsi0,lpr0,lsz0,lsigma20_lr,lsigma20_lor, v0), logL_, method="L-BFGS-B", control=list(trace=6), lower=c(rep(-Inf, 5),0.00001), upper=c(rep(Inf,6))) lpsi0 <- m_est$par[1] lpr0 <- m_est$par[2] lsz0 <- m_est$par[3] lsigma20_lr <- m_est$par[4] lsigma20_lor <- m_est$par[5] v0 <- m_est$par[6] sigma20_lr <- exp(lsigma20_lr) pr0 <- 1/(1+exp(lpr0)) mu0_lrs <- mu0_lr(J,lpsi0, lpr0, lsz0) mu0_lors <- mu0_lor(J, lpr0, lsz0) a1d <- if(is.na(logor2[1])) {sapply(1:J,function(i) (0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[1]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*r0s[i]) } else { sapply(1:J,function(i) (0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[1]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*exp(-lsigma20_lor)*dchisq(logor2[1]*exp(-lsigma20_lor),df=1, ncp=(mu0_lors[i])^2*exp(-lsigma20_lor))*r0s[i]) } c1 <- 1/sum(a1d) a1h <- c1*a1d akh_all <- matrix(NA, n,J) akh_all[1,] <- a1h ca_all <- numeric(n) ca_all[1] <- c1 for (k in 2:n){ akd <- if(is.na(logor2[k])) {(matrix(a1h, nrow=1)%*%hatp0)*((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))} else { (matrix(a1h, nrow=1)%*%hatp0)*(((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))* (exp(-lsigma20_lor)*dchisq(logor2[k]*exp(-lsigma20_lor),df=1, ncp=mu0_lors^2*exp(-lsigma20_lor))))} cka <- 1/sum(akd) ca_all[k] <- cka akh <- cka*akd akh_all[k,] <- akh a1h <- akh } logL_Y[mm+1] <- sum(sum(-log(ca_all[1:n]))+log(sum(akh_all[n,]))) # convergence criteria # if (abs(logL_Y[mm]-logL_Y[mm+1]) < 0.01) { break } cat(c(logL_Y[mm],logL_Y[mm+1]),"\n") } logl_track <- logL_Y[1:(mm+1)] Sys.time()-watch pr0 <- 1/(1+exp(m_est$par[2])) psi0 <- 2^m_est$par[1] sz0 <- 1/(1+exp(m_est$par[3])) sigma20_lr <- exp(m_est$par[4]) sigma20_lor <- exp(m_est$par[5]) v0 <- m_est$par[6] var_lr <- sigma20_lr*v0/(v0-2) idx_hgtype <- sapply(1:n, function(k) which.max(pzks1[k,])) main_hgtype <- sapply(1:n, function(k) main_geneidx[idx_hgtype[k]]) subc_hidx <- ifelse(idx_hgtype >= (mainJ+1), 1, 0) hat_logr <- hat_logor <- numeric(n) hat_logr[which(subc_hidx==0)] <- mu0_lrs[main_hgtype[which(subc_hidx==0)]] hat_logor[which(subc_hidx==0)] <- mu0_lors[main_hgtype[which(subc_hidx==0)]] chrloc_label <- c(as.vector(tapply(1:n, chrloc_thin$chrom, function(x) round(median(x))))) if(!all(subc_hidx==0)){ subzoneh <- which(subc_hidx==1) subzoneh_spl <- split(subzoneh, cumsum(c(1,diff(subzoneh)!=1 | diff(main_hgtype[subzoneh])!=0))) gen_length <- NULL for(i in 1:length(subzoneh_spl)){ nsubi <- range(subzoneh_spl[[i]]) temp0 <- unlist(tapply(chrloc_thin[nsubi[1]:nsubi[2],2], chrloc_thin[nsubi[1]:nsubi[2],1], function(x) range(x)[2]-range(x)[1])) temp1 <- matrix(c(i, sum(temp0)), ncol=2) gen_length <- rbind(gen_length, temp1) } idx_oksubzone <- gen_length[which(gen_length[,2]>=2.5e+07),1] ok_subzoneh_spl <- sapply(idx_oksubzone, function(x) list(subzoneh_spl[[x]])) nsubr <- length(idx_oksubzone) subc_hidx <- numeric(n) subc_hidx[unlist(ok_subzoneh_spl)] <- 1 if(!all(subc_hidx==0)){ #################################### ## Stage 2 ## # t-fit sz_ests <- matrix(0,nrow=nsubr, ncol=2) for(ss in 1:nsubr){ sub_reg <- ok_subzoneh_spl[[ss]] lsz1 <- lsz_est(sub_reg) sz_ests[ss,] <- matrix(c(1/(1+exp(lsz1)),ss), nrow=1) } ###########conditional on U=1 ####### sub_prob <- matrix(0,nsubr,mainJ+1) for(ss in 1:nsubr){ sub_reg <- ok_subzoneh_spl[[ss]] subA <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "A")+mainJ])/sum(x[(mainJ+1):J]))) # subclone A sub0 <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "0")+mainJ])/sum(x[(mainJ+1):J]))) # subclone 0 subAA <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AA")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AA subAB <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AB")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AB subAAA <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAA")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAA subAAB <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAB")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAB subAAAA <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAAA")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAAA subAABB <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AABB")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AABB subAAAB <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAAB")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAAB subAAAAA <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAAAA")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAAAAA subAAABB <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAABB")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAABB subAAAAB <- mean(apply(pzks1[sub_reg, ], 1, function(x) sum(x[which(sub_zk %in% "AAAAB")+mainJ])/sum(x[(mainJ+1):J]))) # subclone AAAAB reg_idc <- matrix(sz_ests[ss,2], ncol=1) sub_prob[ss,] <- matrix(c(reg_idc,sub0,subA, subAA, subAB,subAAA,subAAB, subAAAA,subAABB,subAAAB,subAAAAA,subAAABB,subAAAAB), nrow=1) } colnames(sub_prob) <- c("sub_reg",main_zk) # final copy number profile considering subclone sub_hgtype_dec <- apply(matrix(sub_prob[,2:(mainJ+1)],ncol=mainJ),1, function(x) which.max(x)) sub_hgtype_fin <- numeric(n) sub_hgtype_fin[1:n] <- NA hat_logr <- hat_logor <- numeric(n) hat_logr[which(subc_hidx==0)] <- mu0_lrs[main_hgtype[which(subc_hidx==0)]] hat_logor[which(subc_hidx==0)] <- mu0_lors[main_hgtype[which(subc_hidx==0)]] for(ss in 1:length(ok_subzoneh_spl)){ sub_reg <- ok_subzoneh_spl[[ss]] lszi <- log((1-sz_ests[ss,1])/sz_ests[ss,1]) temp0 <- (tauJ*(unique(main_hgtype[sub_reg])-1)+1):(tauJ*(unique(main_hgtype[sub_reg])-1)+tauJ) state_i <- temp0[which(sub_zk[temp0]==main_zk[sub_hgtype_dec[ss]])]+mainJ hat_logr[sub_reg] <- mu0_lrs[state_i] hat_logor[sub_reg] <- mu0_lors[state_i] sub_hgtype_fin[sub_reg] <- sub_hgtype_dec[ss] } hat_logr <- hat_logor <- numeric(n) hat_logr[which(subc_hidx==0)] <- mu0_lrs[main_hgtype[which(subc_hidx==0)]] hat_logor[which(subc_hidx==0)] <- mu0_lors[main_hgtype[which(subc_hidx==0)]] col_subidx <- rep("dark blue",n) for(ss in 1:length(ok_subzoneh_spl)){ sub_reg <- ok_subzoneh_spl[[ss]] lszi <- log((1-sz_ests[ss,1])/sz_ests[ss,1]) temp0 <- (tauJ*(unique(main_hgtype[sub_reg])-1)+1):(tauJ*(unique(main_hgtype[sub_reg])-1)+tauJ) state_i <- temp0[which(sub_zk[temp0]==main_zk[sub_hgtype_dec[ss]])]+mainJ hat_logr[sub_reg] <- mu0_lr(J, lpsi0, lpr0, lszi)[state_i] hat_logor[sub_reg] <- mu0_lor(J, lpr0, lszi)[state_i] sub_hgtype_fin[sub_reg] <- sub_hgtype_dec[ss] col_subidx[sub_reg] <- "green" } par(mfrow=c(2,1)) plot(1:n, lr, pch=".", col=col_subidx, ylab="logR",xlab="",cex=0.5,xaxt='n') points(1:n,hat_logr, col="red",pch=".") axis(side=1, cex.axis=0.7,at=chrloc_label, labels=c(1:22,"X")) plot(1:n, logor, pch=".",cex=0.5,xaxt='n', col=col_subidx, xlab="",ylab="logOR", ylim=c(min(na.omit(logor)), max(na.omit(logor)))) points(1:n, hat_logor, col="red",pch=".") points(1:n, -hat_logor, col="red",pch=".") axis(side=1,cex.axis=0.7,at=chrloc_label, labels=c(1:22,"X")) par(mfrow=c(3,1)) plot(1:n, main_hgtype,mgp = c(5, 1, 0), pch=".", xaxt='n',ylab="main genotype", ylim=c(1,mainJ), xlab="", yaxt="n",col="red") axis(2, at=1:mainJ, labels=c(main_zk), las=2) axis(side=1,cex.axis=0.8, at=chrloc_label, labels=c(1:22,"X")) plot(1:n, subc_hidx, pch=".", xaxt='n',ylab="subclone existence",xlab="", yaxt="n",col="red") axis(2, at=c(0,1), labels=c("no subclone","subclone")) axis(side=1, cex.axis=0.8,at=chrloc_label, labels=c(1:22,"X")) plot(1:n, sub_hgtype_fin , mgp = c(5, 1, 0),xaxt='n',pch=".", ylab="subclone genotype", yaxt="n", xlab="", ylim=c(1,mainJ),col="red" ) axis(2, at=1:mainJ, labels=c(main_zk), las=2) axis(side=1,cex.axis=0.8, at=chrloc_label, labels=c(1:22,"X")) mainclone.genotype.index <- main_hgtype subclone.ind <- subc_hidx subclone.genotype <- sub_hgtype_fin } else{ par(mfrow=c(2,1)) plot(1:n, lr, pch=".", col="dark blue", ylab="logR",xlab="",cex=0.5,xaxt='n') points(1:n,hat_logr, col="red",pch=".") axis(side=1, cex.axis=0.7,at=chrloc_label, labels=c(1:22,"X")) plot(1:n, logor, pch=".",cex=0.5,xaxt='n', col="dark blue", xlab="",ylab="logOR", ylim=c(min(na.omit(logor)), max(na.omit(logor)))) points(1:n, hat_logor, col="red",pch=".") points(1:n, -hat_logor, col="red",pch=".") axis(side=1,cex.axis=0.7,at=chrloc_label, labels=c(1:22,"X")) par(mfrow=c(2,1)) plot(1:n, main_hgtype,mgp = c(5, 1, 0), pch=".", xaxt='n',ylab="main genotype", ylim=c(1,mainJ), xlab="", yaxt="n",col="red") axis(2, at=1:mainJ, labels=c(main_zk), las=2) axis(side=1,cex.axis=0.8, at=chrloc_label, labels=c(1:22,"X")) plot(1:n, subc_hidx, pch=".", xaxt='n',ylab="subclone existence",xlab="", yaxt="n",col="red") axis(2, at=c(0,1), labels=c("no subclone","subclone")) axis(side=1, cex.axis=0.8,at=chrloc_label, labels=c(1:22,"X")) mainclone.genotype.index <- main_hgtype subclone.ind <- subc_hidx subclone.genotype <- NA } }else{ par(mfrow=c(2,1)) plot(1:n, lr, pch=".", col="dark blue", ylab="logR",xlab="",cex=0.5,xaxt='n') points(1:n,hat_logr, col="red",pch=".") axis(side=1, cex.axis=0.7,at=chrloc_label, labels=c(1:22,"X")) plot(1:n, logor, pch=".",cex=0.5,xaxt='n', col="dark blue", xlab="",ylab="logOR", ylim=c(min(na.omit(logor)), max(na.omit(logor)))) points(1:n, hat_logor, col="red",pch=".") points(1:n, -hat_logor, col="red",pch=".") axis(side=1,cex.axis=0.7,at=chrloc_label, labels=c(1:22,"X")) par(mfrow=c(2,1)) plot(1:n, main_hgtype,mgp = c(5, 1, 0), pch=".", xaxt='n',ylab="main genotype", ylim=c(1,mainJ), xlab="", yaxt="n",col="red") axis(2, at=1:mainJ, labels=c(main_zk), las=2) axis(side=1,cex.axis=0.8, at=chrloc_label, labels=c(1:22,"X")) plot(1:n, subc_hidx, pch=".", xaxt='n',ylab="subclone existence",xlab="", yaxt="n",col="red") axis(2, at=c(0,1), labels=c("no subclone","subclone")) axis(side=1, cex.axis=0.8,at=chrloc_label, labels=c(1:22,"X")) mainclone.genotype.index <- main_hgtype subclone.ind <- subc_hidx subclone.genotype <- NA } ##################################################### ### Asymptotic SE for global estimators - stage 1 parameters library(numDeriv) hes_logL1 <- function(parm){ lpsi0 <- parm[1] lpr0 <- parm[2] lsigma20_lr <- parm[3] lsigma20_lor <- parm[4] v0 <- parm[5] sigma20_lr <- exp(lsigma20_lr) pr0 <- 1/(1+exp(lpr0)) mu0_lrs <- mu0_lr(J,lpsi0, lpr0, lsz0) mu0_lors <- mu0_lor(J, lpr0, lsz0) a1d <- if(is.na(logor2[1])) {sapply(1:J,function(i) (0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[1]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*r0s[i]) } else { sapply(1:J,function(i) (0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[1]-mu0_lrs[i])^2/(2*sigma20_lr)))^((v0+1)/2))*exp(-lsigma20_lor)*dchisq(logor2[1]*exp(-lsigma20_lor),df=1, ncp=(mu0_lors[i])^2*exp(-lsigma20_lor))*r0s[i]) } c1 <- 1/sum(a1d) a1h <- c1*a1d akh_all <- matrix(NA, n,J) akh_all[1,] <- a1h ca_all <- numeric(n) ca_all[1] <- c1 for (k in 2:n){ akd <- if(is.na(logor2[k])) {(matrix(a1h, nrow=1)%*%hatp0)*((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))} else { (matrix(a1h, nrow=1)%*%hatp0)*(((0.5*v0)^(0.5*v0)*gamma((v0+1)/2)*(2*pi*sigma20_lr)^-0.5/(gamma(v0/2)*(0.5*v0+((lr[k]-mu0_lrs)^2/(2*sigma20_lr)))^((v0+1)/2)))* (exp(-lsigma20_lor)*dchisq(logor2[k]*exp(-lsigma20_lor),df=1, ncp=mu0_lors^2*exp(-lsigma20_lor))))} cka <- 1/sum(akd) ca_all[k] <- cka akh <- cka*akd akh_all[k,] <- akh a1h <- akh } logL_Y <- sum(sum(-log(ca_all[1:n]))+log(sum(akh_all[n,]))) return(logL_Y) } theta1 <- c(lpsi0,lpr0,lsigma20_lr,lsigma20_lor,v0) hes_theta1 <- hessian(hes_logL1, theta1) var_theta1 <- solve(-hes_theta1) theta1_dev <- diag(c(2^lpsi0*log(2), -(1+exp(lpr0))^-2*exp(lpr0), exp(lsigma20_lr),exp(lsigma20_lor),1)) var_theta <- t(theta1_dev)%*%var_theta1%*%theta1_dev se_theta <- sqrt(diag(var_theta)) # asymtotic standard errors of the global estimators given data and lsz0 var1_s2lr <- diag(var_theta)[3] var1_s2v <- diag(var_theta)[5] asymvar1_lr <- 1/(v0-2)^2*((v0^2*var1_s2lr)-4*v0/(v0-2)*exp(lsigma20_lr)*var_theta[3,5]+4/((v0-2))^2*(exp(lsigma20_lr))^2*var1_s2v) asymse_varlr <- sqrt(asymvar1_lr) file_name <- paste("res_TCGA-KL-8331_subHMM.RData", sep="") save(mainclone.genotype.index,subclone.ind,subclone.genotype, m_est, r0,u0,r0s,hatp0_m,logl_track, hatp0,hatp0_M,hatp0_t0,mainJ, tauJ,J,n,hes_theta1, se_theta,var_lr,asymse_varlr, file=file_name)