################################################################################# ################################################################################# # R functions associated with # "Smoothing Parameter Selection for a Class of Semiparametric Linear Models" # by Philip T. Reiss and R. Todd Ogden, to appear in JRSS-B # September 9, 2008 # # These functions can be used to fit *some* of the models described in the paper. # # This code is distributed freely with no warranties of any kind. It has not # been checked for bugs as thoroughly as we would have liked. If you use the # code, we'd very much appreciate hearing from you about it. Please contact the # first author at phil.reiss@nyumc.org for further information or code (e.g., # to use GCV, rather than REML, for smoothing parameter selection), to # provide feedback, or to report problems. ################################################################################# ################################################################################# ################################################################################# # get.data ################################################################################# # Input signal data sets such as the near-infrared spectroscopy data sets # referred to in the paper. The latter are available at Phil Hopke's ftp site, # ftp:// ftp.clarkson.edu/pub/hopkepk/Chemdata/Kalivas/ # # Arguments: # respfile file name for the text file of scalar responses (n x N) # specfile file name for the text file of signal predictors (vector of length n) # s.index values (e.g., wavelengths for NIR spectra) indexing the signals. # dif logical indicating whether to take first differences of the signals # plotit logical indicating whether to plot the signals # demean logical indicating whether to subtract the mean from each column of the signals # ylabel, labels for the plots, with defaults appropriate for the data sets above # xlabel # # Value: a list with components # y responses # s matrix of signals (dimension = number of signals x signal length) # s.index same as argument s.index # # Example: # datalist <- get.data('octane.asc', 'gasspec.asc', 2*(450:850)) # inputs the gasoline data, assuming the two data files are in the working directory get.data <- function(respfile, specfile, s.index, dif=FALSE, plotit=TRUE, demean=TRUE, ylabel = if (dif) 'Differenced log(1/R)' else 'log(1/R)', xlabel='Wavelength (nm)') { y = as.matrix(read.table(respfile)) raw.signal <- as.matrix(read.table(specfile)) s <- if (dif) t(diff(t(raw.signal))) else raw.signal if (plotit) { matplot(s.index, t(s), type='l', xlab=xlabel, ylab=ylabel, col=1) } s <- scale(s, center=demean, scale=F) list(y=y, s=s, s.index=s.index) } ################################################################################# ################################################################################# # fpcr ################################################################################# # Run functional principal component regression (Reiss and Ogden, JASA, 2007) # with number of components chosen (if necessary) by multifold cross-validation # and roughness penalty parameter chosen by REML # # Note: the function checks for the existence of appropriate objects 'basisobj' and # 'halfpen', created, respectively, by the functions make.basis and make.halfpen # (see below). If these do not exist they are created (in the global environment). # If you would like to experiment with different B-spline bases or penalties, # use those functions to create new objects with these names. Alternatively, # remove or rename the objects in question, and they will be re-created by fpcr. # # Arguments: # yy response vector # signals a matrix of signals with mean-zero columns (this will be checked) # nc number of principal components, or a vector of possible numbers of components; # the value 0 yields the full penalized B-spline expansion (no reduction to PCs) # nint number of intervals for B-spline basis; see nint argument in function make.basis # (by default, 40, but this may be higher than needed) # nfold 'k' for k-fold cross-validation # givenlam fixed value to which the the smoothing parameter is set (rather than choosing it # by REML) # s.index values (e.g., wavelengths for NIR spectra) indexing the signals. This is needed only if # the B-spline basis object basisobj does not exist (in which case basisobj is created # by a call to make.basis) # plotit logical: should the coefficient function estimate be plotted? # pentype argument passed to function make.halfpen if the matrix halfpen does not exist in the global # environment (in which case halfpen is created by a call to make.halfpen) # cv1 logical: if nc has length 1, should multifold cross-validation be performed to estimate # prediction error? (If nc has length > 1, this argument is ignored, and CV is # performed to choose the number of components) # # Value: a list with components # fhat coefficient function estimate # nc number of components for the final model # nint same as argument nint # lambda smoothing parameter estimate # sumsspred prediction error estimate for each number of components considered, if CV was performed # df effective degrees of freedom for the model # intercept model intercept # fitted fitted values for model # # Example: choose among 25-, 30-, and 35-component models for the gasoline data by 5-fold cross-validation # datalist = get.data('octane.asc', 'gasspec.asc', 2*(450:850)) # fpcrmod = fpcr(datalist$y, datalist$s, c(25,30,35), nfold=5, s.index=datalist$s.index) fpcr <- function(yy, signals, nc, nint=40, nfold=NULL, givenlam=NULL, s.index=NULL, plotit=FALSE, pentype='deriv2', cv1=F) { require(splines) require(MASS) # Center the signals signals = scale(signals, scale=F) if (!exists('basisobj')) { if (is.null(s.index)) stop('Must set s.index argument in order to define B-spline basis') assign('basisobj', make.basis(s.index = s.index, nint), env=.GlobalEnv) } else if (basisobj$nb != nint + 3) stop("Number of intervals differs from that for existing B-spline basis object 'basisobj'; please remove or rename it before proceeding") bspline.basis = basisobj$bspline.basis knots = basisobj$knots nbasis = basisobj$nbasis s.index = basisobj$s.index if (!exists('halfpen') || ncol(halfpen)!=nbasis) assign('halfpen', make.halfpen(pentype, s.index = s.index, knots, nbasis), env=.GlobalEnv) if (max(nc) > nrow(halfpen)) stop(paste('Maximum number of components for the given basis is', nrow(halfpen))) if (((length(nc) > 1) || (cv1)) && is.null(nfold)) stop('Must set nfold argument to specify k for k-fold cross-validation') if (!is.null(nfold) && max(nc) > 0) { vv <- array(0,c(nbasis,max(nc),nfold)) pvv <- array(0,c(nrow(halfpen),dim(vv)[[2]],nfold)) rrr <- array(0,c(dim(vv)[[2]],dim(vv)[[2]],nfold)) for (fold in 1:nfold) { trainers <- (1:length(yy))[(1:length(yy))%%nfold!=(fold-1)] sbtrain <- scale(signals[trainers,], scale=F) %*% bspline.basis vv[,,fold] <- svd(sbtrain)$v[,1:max(nc)] pvv[,,fold] <- halfpen %*% vv[,,fold] rrr[,,fold] <- qr.R(qr(pvv[,,fold])) } } ################################################# # Find optimal number of components ################################################# sumsspred <- rep(NA,length(nc)) names(sumsspred) <- nc if (length(nc)==1 && cv1==F) bestnc <- nc # but if cv1==T, calculate CV (next section of code) even if length(nc)==1 else { for (cc in 1:length(nc)) { cat(nc[cc],'components\n') sspred <- rep(NA,nfold) for (fold in 1:nfold) { trainers <- (1:length(yy))[(1:length(yy))%%nfold!=(fold-1)] ytrain <- yy[trainers] yval <- yy[-trainers] straw <- signals[trainers,] strain <- scale(straw, scale=F) sbtrain <- strain %*% bspline.basis svraw <- signals[-trainers,] sval <- svraw + (strain-straw)[1:nrow(svraw),] if (nc[cc]==0) {x <- bspline.basis %*% Null(t(halfpen)) xx <- cbind(rep(1,length(ytrain)), strain %*% x) z <- bspline.basis %*% ginv(halfpen)} else {x <- NULL xx <- matrix(1,length(trainers),1) z <- bspline.basis %*% vv[,1:nc[cc],fold] %*% solve(rrr[1:nc[cc],1:nc[cc],fold])} zz <- strain %*% z ########################################## # Given that number of components = nc[cc], estimate SS.pred by cross-validation bestlam <- if (!is.null(givenlam)) givenlam else penmod(ytrain, x, xx, z, zz)$lam mm <- penmod(ytrain, x, xx, z, zz, bestlam) sspred[fold] <- crossprod(yval - mm$int - sval %*% mm$fhat) ########################################## } # fold loop # Add up SS.pred for each validation set sumsspred[cc] <- sum(sspred) } # cc loop cat('SS.pred for each number of components:\n') print(sumsspred) bestnc <- nc[which.min(sumsspred)] cat('Chosen number of components is', bestnc, '\n') } # end of 'else' clause (--> choice of number of components ends here) ################################################# # Fit model given optimal number of components ################################################# # Create design matrices if (bestnc==0) {x <- bspline.basis %*% Null(t(halfpen)) xx <- cbind(rep(1,length(yy)), signals %*% x) z <- bspline.basis %*% ginv(halfpen)} else { sb.all <- signals %*% bspline.basis vv.all <- as.matrix(svd(sb.all)$v[,1:bestnc]) pvv.all <- halfpen %*% vv.all rrr.all <- qr.R(qr(pvv.all)) x <- NULL xx <- matrix(1,length(yy),1) z <- bspline.basis %*% vv.all %*% solve(rrr.all) } zz <- signals %*% z modfit <- penmod(yy, x, xx, z, zz, if (!is.null(givenlam)) givenlam else NULL) if (plotit==TRUE) { if (length(nc)==1 && cv1==T) lines(s.index, modfit$fhat, lwd=4) else plot(s.index, modfit$fhat,type='l') } ztz <- crossprod(zz) df <- try(sum(diag(solve(ztz + diag(as.numeric(modfit$lambda), nrow(ztz)), ztz)))) list(fhat=modfit$fhat, nc=bestnc, nint=nint, lambda=modfit$lambda, sumsspred=sumsspred, df=df, intercept=modfit$intercept, fitted=modfit$intercept + signals %*% modfit$fhat) } ################################################################################# ################################################################################# # make.basis ################################################################################# # Create a cubic B-spline basis # (thanks to Paul Eilers & Brian Marx, on whose code this is loosely based) # # Arguments: # s.index locations at which B-spline functions are evaluated (corresponding to wavelengths for NIR # spectra) # nint number of intervals between equally-spaced knots from, from min(s.index) to max(s.index) # # Value: a list with components # bspline.basis length(s.index) x (nint+3) matrix, each column of which gives the values of a B-spline # basis function at the locations given by s.index (nint+3 is the number of cubic # B-splines in a basis defining nint internal intervals) # knots equally spaced knot values (nint+7 of them, including external knots) # nbasis number of basis functions (equal to nint+3) # s.index same as argument s.index make.basis <- function(s.index, nint) { smin <- min(s.index) smax <- max(s.index) ds <- (smax - smin)/nint knots <- seq(smin - 3 * ds, smax + 3 * ds, by = ds) bspline.basis <- splineDesign(knots, s.index, 4, 0 * s.index) nbasis <- ncol(bspline.basis) cat('Creating B-spline basis\n') list(bspline.basis=bspline.basis, knots=knots, nbasis=nbasis, s.index=s.index) } ################################################################################# ################################################################################# # make.halfpen ################################################################################# # Create a matrix used to define the roughness penalty # # Arguments: # pentype type of roughness penalty: may be either 'deriv2' for squared second derivative penalty, # or paste('diff', k, sep='') for k-th difference penalty as in Marx & Eilers' # P-splines (e.g. 'diff2' for second difference) # s.index locations at which B-spline functions are evaluated (corresponding to wavelengths for NIR # spectra) # knots equally spaced knot values # nbasis number of basis functions # # Value: an (nbasis-k) x nbasis matrix M, where k is the degree of the penalty, such that # M'M is the roughness penalty make.halfpen <- function(pentype, s.index, knots, nbasis) { if (pentype!='deriv2' && substr(pentype,1,4)!='diff') stop("pentype must be 'deriv2' or 'diff1', 'diff2', etc.") if (pentype=='deriv2') halfpen <- qr.R(qr(splineDesign(knots, s.index, 4, rep(2, length(s.index)))))[1:(nbasis-2), ] else if (substr(pentype,1,4)=='diff') halfpen <- diff(diag(nbasis), diff=as.numeric(substr(pentype, nchar(pentype), nchar(pentype)))) cat('Creating penalty matrix\n') halfpen } ################################################################################# ################################################################################# # penmod ################################################################################# # Fit a signal regression model with roughness penalty # # If the smoothing parameter lambda is not supplied, it is estimated by REML # using function lme from package nlme. # Either way the coefficients are divided into "fixed" and "random" effects. # This is an internal function called by fpcr. penmod <- function(y0, x0, xx0, z0, zz0, lambda=NULL) { nfix <- ncol(xx0) # Find lambda and fhat by REML if (is.null(lambda)) { require(nlme) lme.gp <- rep(1,length(y0)) lmefit <- lme(y0~-1+xx0,random=list(lme.gp=pdIdent(~-1+zz0))) lambda <- exp(-2*unlist(lmefit$modelStruct)) fixx <- lmefit$coef$fix rndm <- t(lmefit$coef$random$lme.gp) if (nfix > 1) fhat <- x0 %*% lmefit$coef$fix[-1] + z0 %*% t(lmefit$coef$random$lme.gp) else fhat <- z0 %*% t(lmefit$coef$random$lme.gp) } # Given lambda, find fhat else { CC <- cbind(xx0,zz0) inner <- crossprod(CC) diag(inner)[-(1:nfix)] <- diag(inner)[-(1:nfix)] + lambda blueblup <- solve(inner, t(CC) %*% y0) fixx <- blueblup[(1:nfix), 1] rndm <- blueblup[-(1:nfix), 1] fixedpart <- 0 if (nfix > 1) fixedpart <- x0 %*% as.matrix(fixx)[-1,] fhat <- fixedpart + z0 %*% rndm } list(lambda=lambda, fhat=fhat, intercept=mean(y0), fixx=fixx, rndm=rndm) }