########################################################################## # Free vs. Transcribed -- Classifier Evaluation # # # # run.R # # # # by: Kevin Killourhy # # date: August 23, 2012 # ########################################################################## ########################################################################## # This script reproduces the results of the paper: ## # K.S. Killourhy and R.A. Maxion. "Free vs. Transcribed Text for # Keystroke-Dynamics Evaluations," in LASER'12, Washington, DC, # July 18-19, 2012, ACM Press. ## # The script has been shared in order to promote scientific progress, # and to encourage other researchers to reproduce and expand upon our # results. ## # Instructions to reproduce our results: # - Run R (v 2.13.1) in the parent directory of this script. # - Execute the following instructions: # source( 'r/run.R' ); # run(); # - Wait for the function to complete. # - Confirm that the script printed the proper proportion of # significant statistical tests (i.e., 28.9% and 3.6%). # - Confirm that the figs directory contains reproductions of # Figures 2 and 3 from the paper. # - Explore the data and code to understand the calculations and # analysis that produced the figures and results. ####################################################################### library( reshape ); library( plyr ); library( lattice ); ########################################################################## # Configuration figsfile <- function(fname) sprintf('figs/%s', fname); datafiles <- list(Hold = 'data/TimingFeatures-Hold.txt', DD = 'data/TimingFeatures-DD.txt'); sessionfile <- 'data/SessionMap.txt'; groupmap <- list('Group 1' = c('Sea-Trans','Runaway-Free','Girl-Trans','Doll-Free'), 'Group 2' = c('Sea-Free','Runaway-Trans','Girl-Free','Doll-Trans'), 'Group 3' = c('Runaway-Trans','Sea-Free','Doll-Trans','Girl-Free'), 'Group 4' = c('Runaway-Free','Sea-Trans','Doll-Free','Girl-Trans')); groupmtx <- sapply( groupmap, identity ); ########################################################################## # Data Manipulation Functions ### # This function identifies the group to which a subject belongs based # on the first session (in the session map). ### # user: list of users ## # S: the mapping from the session-map file ### # groups: the list of groups returned (same length as user) ### getusergroup <- function( user, S ) { S <- transform( S, value = sub('.* - (.*) - (.*)', '\\1-\\2', value ) ); tbl <- cast( S, user ~ sessionIndex ); groupnames <- colnames( groupmtx ); groupses1 <- groupmtx[ 1, ]; gidxs <- match( as.character( tbl[['1']] ), groupses1 ); tbl$group <- groupnames[ gidxs ]; for( i in 2:4 ) { groupsesi <- groupmtx[ i, ]; gidxs <- match( as.character( tbl[[sprintf('%d',i)]] ), groupsesi ); stopifnot( identical( tbl$group, groupnames[ gidxs ] ) ); } uidxs <- match( user, tbl$user ); stopifnot( all( is.na( uidxs ) == FALSE ) ); groups <- factor( tbl$group[ uidxs ], levels=groupnames ); stopifnot( all( is.na( groups ) == FALSE ) ); return( groups ); } ### # This function takes a block level (i.e., a count of how many # blocks to include in the data set) and retrieves the data from the # appropriate data files ### # blockidx: the number of blocks (exercises) to include ## # S: the mapping from the session-map file ### # Z: the keystroke-timing data ### getdata <- function( blockidx, S ) { Z <- ldply( names( datafiles ), function( dataname ) { Zi <- read.table( datafiles[[ dataname ]], header=TRUE ); Zi$blockIndex <- Zi$screenIndex - 2; Zi <- subset( Zi, blockIndex <= blockidx ); if( dataname == 'Hold' ) { with( Zi, data.frame( subject = subject, sessionIndex = sessionIndex, feature = sprintf('H.%s', key), time = time ) ); } else if( dataname == 'DD' ) { with( Zi, data.frame( subject = subject, sessionIndex = sessionIndex, feature = sprintf('DD.%s.%s', key1, key2), time = time ) ); } else { stop("Unknown dataname ",dataname); } } ); Z$group <- getusergroup( Z$subject, S ); Z$sessionName <- factor( groupmtx[ cbind( Z$sessionIndex, Z$group ) ] ); Z$type <- factor(sub( '^([A-Za-z]+)\\..*$', '\\1', as.character(Z$feature) )); Z$isfree <- grepl( 'Free', as.character( Z$sessionName ) ); return( Z ); } ########################################################################## # Classifier Functions ### # Compare two typing samples using the likelihood ratio testing # procedure designed by Gaines et al (1980). ### # Xdf: a data frame containing the timing features from one session, # designated to come from a known user (e.g., the genuine user); must # have columns named 'feature' and 'time'. ## # Ydf: a data frame containing the timing features from one session, # designated to have come from an unidentified user (e.g., the test # user); must have columns named 'feature' and 'time'. ## # repmin: the minimum number of repetitions necessary for a feature to # be included in the analysis ## # rmax: the number of distinct features to use. Based on this, the # number of samples will be determined based on the number of samples # in the r-th most frequent feature. ### # score: the sample anomaly score (a dissimilarity measure). ### statclassifier <- function( Xdf, Ydf, repmin=5, rmax=Inf ) { # Find the set of features for which both X and Y have more than 1 sample xcount <- table( Xdf$feature ); ycount <- table( Ydf$feature ); xnames <- names( xcount )[ xcount >= repmin ]; ynames <- names( ycount )[ ycount >= repmin ]; fnames <- intersect( xnames, ynames ); stopifnot( length( fnames ) >= 2 ); # Select the most prevalent features if we have more than rmax if( length( fnames ) > rmax ) { xycount <- table( unlist( list( Xdf$feature, Ydf$feature ) ) ); fnames <- names( sort( xycount[ fnames ], decreasing = TRUE ) )[ 1:rmax ]; } # Set r, M, and N r <- length( fnames ); M <- min( xcount[ fnames ] ); N <- min( ycount[ fnames ] ); stopifnot( M >= repmin && N >= repmin ); # Filter out the unused features and convert to melted column naming Xm <- ddply( Xdf, .(feature), function( Xdfi ) { if( as.character( unique( Xdfi$feature ) ) %in% fnames ) { data.frame(feature = Xdfi$feature[1:M], index = 1:M, value = Xdfi$time[1:M] ); } else { return( NULL ); } } ); Ym <- ddply( Ydf, .(feature), function( Ydfi ) { if( as.character( unique( Ydfi$feature ) ) %in% fnames ) { data.frame(feature = Ydfi$feature[1:N], index = 1:N, value = Ydfi$time[1:N] ); } else { NULL; } } ); # Reshape and convert to a matrix of features Xc <- cast( Xm, index ~ feature )[ , -1 ]; Yc <- cast( Ym, index ~ feature )[ , -1 ]; X <- as.matrix( Xc ); colnames( X ) <- names( Xc ); Y <- as.matrix( Yc ); colnames( Y ) <- names( Yc ); # Calculate the test statistic, derived from Gaines et al. appendix xbar <- colMeans( X ); ybar <- colMeans( Y ); ssx <- colSums( sweep( X, 2, xbar )^2 ); ssy <- colSums( sweep( Y, 2, ybar )^2 ); U <- sum( log( 1 + ( xbar - ybar )^2 / ( (1/M+1/N) * ( ssx + ssy ) ) ) ); score <- U / r; names( score ) <- 'score'; return( score ); } ### # Disorder-based similarity metric used by Bergadano et al. ### ### # Xdf: a data frame containing the timing features from one session, # designated to come from a known user (e.g., the genuine user); must # have columns named 'feature' and 'time'. ## # Ydf: a data frame containing the timing features from one session, # designated to have come from an unidentified user (e.g., the test # user); must have columns named 'feature' and 'time'. ## # repmin: the minimum number of repetitions necessary for a feature to # be included in the analysis ### # score: the sample anomaly score (a dissimilarity measure). ### disorderclassifier <- function( Xdf, Ydf, repmin=5 ) { # Find the features shared by both data sets and filter the rest xcount <- table( Xdf$feature ); ycount <- table( Ydf$feature ); xnames <- names( xcount )[ xcount >= repmin ]; ynames <- names( ycount )[ ycount >= repmin ]; fnames <- intersect( xnames, ynames ); stopifnot( length( fnames ) >= 2 ); Xdf <- droplevels( subset( Xdf, feature %in% fnames ) ); Ydf <- droplevels( subset( Ydf, feature %in% fnames ) ); # Find the average timing for each feature xs <- daply( Xdf, .(feature), function( Xi ) mean( Xi$time ) ); ys <- daply( Ydf, .(feature), function( Yi ) mean( Yi$time ) ); # Arrange the features in sorted order xs <- sort( xs, decreasing=TRUE ); ys <- sort( ys, decreasing=TRUE ); # Extract the list of features, sorted from most to least frequent xrank <- names( xs ); yrank <- names( ys ); # Compare the orderings (basically find a permutation from yrank to xrank) sorder <- match( yrank, xrank ); # Calculate the numeric degree of disorder n <- length( sorder ); d <- sum( abs( sorder - 1:n ) ); # Normalize to a score between 0 and 1 score <- d / ifelse( n %% 2 == 1, ( n^2 - 1 )/2, n^2/2 ); return( list( score=score ) ); } ########################################################################## # Evaluation Functions ### # This function takes a Z matrix for a bunch of users who are all # assumed to be in the same group. Based on the sesname, it # identifies the training and test session for the free task and the # training and test session for the transcription task. For each, and # for each combination of training and test user, it runs the # classifier and records the score. It returns a data frame with the # training subject, testing subject, free/trans indicator, and score. ### # Z: the keystroke-timing data ## # classifierfunc: either of the classifier comparison functions ### # R: data frame organizing scores for every pairing of user and # impostor subject within each group. ## # R columns: group, isfree, trainuser, testuser, score. ### evaluategroup <- function( Z, classifierfunc ) { group <- as.character( unique( Z$group ) ); stopifnot( Z$sessionIndex %in% 1:4 ); stopifnot( length( group ) == 1 ); trainmask <- Z$sessionIndex <= 2; freemask <- grepl( 'Free', Z$sessionName ); subjs <- as.character( sort( unique( Z$subject ) ) ); E <- expand.grid(isfree = c(FALSE,TRUE), trainuser = subjs, testuser = subjs); R <- adply( E, 1, function( Ei ) { Xdf <- subset( Z, subject == as.character( Ei$trainuser ) & trainmask == TRUE & freemask == Ei$isfree ); Ydf <- subset( Z, subject == as.character( Ei$testuser ) & trainmask == FALSE & freemask == Ei$isfree ); score <- classifierfunc( Xdf, Ydf ); data.frame( group = group, Ei, score=score ); }, .progress='text' ); return( R ); } ### # This function generates an experiment matrix for use in scripting a # systematic run through all the across-group experimental # combinations of factors (the within-group factors must be handled at # a lower level). We focus on the Hold-plus-Log(DD) featureset since # it produced the best results in preliminary trials. We filter out # blockcount == 1 since there are too few digraphs to meet the repmin # constraints. ### ### # A: Design matrix (really a data frame) ## # A columns: classifier, featureset, blockcount ### getdesign <- function() { A <- expand.grid(classifier = c('Statistical','DisorderBased'), featureset = c('Hold+LogDD'), blockcount = 1:8); A <- subset( A, blockcount > 1 | featureset == 'Hold' ); return( A ); } ### # Extract a subset of the data based on the featureset ### # Z: the keystroke-timing data ## # featureset: the label for the features to include (plus transformations) ## # repmin: the minimum number of repetitions necessary for a feature to # avoid being filtered out. ## # eps: the epsilon offset that we add to zero times to avoid taking # the log of zero. ### # Z: the filtered keystroke-timing data ### filterfeatures <- function( Z, featureset, repmin=0, eps=.Machine$double.eps ) { holdpat <- '^H\\.[a-z]$'; ddpat <- '^DD\\.[a-z]\\.[a-z]$'; udpat <- '^UD\\.[a-z]\\.[a-z]$'; if( featureset == 'Hold' ) { Z <- droplevels( subset( Z, grepl( holdpat, feature ) ) ); } else if( featureset == 'DD' ) { Z <- droplevels( subset( Z, grepl( ddpat, feature ) ) ); } else if( featureset == 'Hold+DD' ) { Z <- droplevels( subset( Z, grepl( holdpat, feature ) | grepl( ddpat, feature ) ) ); } else if( featureset == 'LogDD' ) { Z <- droplevels( subset( Z, grepl( ddpat, feature ) ) ); ltime <- with( Z, log( ifelse( time == 0, time + eps, time ) ) ); Z <- transform( Z, time = ltime ); levels( Z$feature ) <- sub( '^DD\\.', 'LogDD\\.', levels( Z$feature ) ); } else if( featureset == 'Hold+LogDD' ) { Z <- droplevels( subset( Z, grepl( holdpat, feature ) | grepl( ddpat, feature ) ) ); ltime <- with( Z, log( ifelse( time == 0, time + eps, time ) ) ); Z <- transform( Z, time = ifelse( grepl( '^DD', feature ), ltime, time ) ); levels( Z$feature ) <- sub( '^DD\\.', 'LogDD\\.', levels( Z$feature ) ); } else if( featureset == 'THE' ) { Z <- droplevels( subset( Z, feature %in% c('H.t','H.h','H.e', 'DD.t.h','DD.h.e') ) ); } else { stop("Unsupported featureset ",featureset); } if( repmin > 0 ) { tbl <- xtabs( ~ subject + sessionIndex + feature, Z ); mincount <- apply( tbl, 3, min ); fnames <- names( mincount )[ mincount >= repmin ]; Z <- droplevels( subset( Z, feature %in% fnames ) ); } return( Z ); } ### # This function takes a row of the design matrix A, loads the # necessary data, preprocesses it, and calls evaluategroup on each # group in the data set. It concatenates and returns the results. ### # Ai: row of the design matrix A ## # S: the mapping from the session-map file ### # R: data frame of evaluation results ## # R columns: group, isfree, trainuser, testuser, score. ### runevaluation <- function( Ai, S ) { print( Ai ); classifier <- as.character( Ai$classifier ); featureset <- as.character( Ai$featureset ); blockcount <- Ai$blockcount; # Look up the classifier if( classifier == 'Statistical' ) { classifierfunc <- statclassifier; } else if( classifier == 'DisorderBased' ) { classifierfunc <- disorderclassifier; } else { stop("Unsupported classifier",classifier); } stopifnot( ! is.null( classifierfunc ) ); # Load the data Z <- getdata( blockcount, S ); # Run a low-pass filter and a rounding Z <- subset( Z, time <= 0.500 ); Z <- transform( Z, ifelse( time < 0.001, 0.001, time ) ); # Filter out the unused features Z <- filterfeatures( Z, featureset ); # Run the evaluation R <- ddply( Z, .(group), evaluategroup, classifierfunc = classifierfunc ); # Print the zero-false-alarm miss rate rsum <- ddply( R, .(isfree), function( Ri ) { R1 <- subset( Ri, trainuser == testuser ); R2 <- subset( Ri, trainuser != testuser ); data.frame( value = mean( R2$score <= max( R1$score ) ) ); } ); print( cast( rsum, . ~ isfree ) ); cat("\n\n"); return( R ); } ########################################################################## # Data Analysis Functions ### # Find a threshold that achieves a target empirical false alarm rate, # and add threshold, alarm, and error columns to the results data # frame. (Meant to be called once per set of tunable factors.) ### # R: data frame of results ## # alpha: false alarm rate to target (as a nearest upper bound) ### # R: data frame of results with threshold, alarm, and error columns # added. The threshold is the threshold chosen for that result. # Alarm is a binary indicating whether the anomaly score exceeds the # threshold. Error is a binary indicating whether the alarm (or lack # thereof) is in error. ### # R columns: group, isfree, tranuser, testuser, score, thresh, alarm, error ### tunethreshold <- function( R, alpha=.05 ) { R1 <- droplevels( subset( R, trainuser == testuser ) ); thresh <- quantile( R1$score, 1 - alpha, type=1 ); stopifnot( mean( R1$score > thresh ) <= alpha ); R <- transform( R, thresh = thresh, alarm = score > thresh ); R <- transform( R, error = xor( isimp, alarm ) ); return( R ); } ### # Conduct a series of t tests (or other one-way, two-sample, unpaired # test procedure) to find a significant differences. ### # Z: the keystroke-timing data ## # gvarnames: names of grouping columns (over which separate tests are performed) ## # tvarname: name of test column (grouping variable for test comparison) ## # valname: name of value column (column containing values to be tested) ## # do.prop: boolean indicator that the t-test should be replaced by a prop test ## # TEST: actual test procedure (e.g., t.test or prop.test) ### # P: data frame of test results ## # P columns (if do.prop == FALSE): the gvarnames, diff, p.value. The # diff is the estimated effect size and p.value is the p value. ## # P columns (if do.prop == TRUE): the gvarnames, p1, p2, and p.value. # The p1 and p2 are the estimated proportions in the first and second # groups. ### ddplytests <- function( Z, gvarnames, tvarname, valname, do.prop = FALSE, TEST = if( do.prop ) prop.test else t.test, ... ) { P <- ddply( Z, gvarnames, function( Zi ) { stopifnot( is.logical( Zi[[tvarname]] ) ); if( do.prop ) { stopifnot( length( valname ) == 2 ); xs <- tapply( Zi[[ valname[1] ]], Zi[[ tvarname ]], sum ); ns <- tapply( Zi[[ valname[2] ]], Zi[[ tvarname ]], sum ); tobj <- TEST( x=xs, n=ns, ... ); pval <- tobj[['p.value']]; if( xs[[1]] / ns[[1]] == xs[[2]] / ns[[2]] ) pval <- 1.0; data.frame(p1 = xs[[1]] / ns[[1]], p2 = xs[[2]] / ns[[2]], p.value = pval); } else { stopifnot( length( valname ) == 1 ); xs <- Zi[[valname]][ Zi[[tvarname]] ]; ys <- Zi[[valname]][ ! Zi[[tvarname]] ]; tobj <- TEST( x=xs, y=ys, ... ); data.frame(diff = mean(xs) - mean(ys), p.value = tobj[['p.value']] ); } } ); return( P ); } ########################################################################## # Main Function run <- function() { ######################################################################## # 1. Read and preprocess the data ######################################################################## # Read in the session map S <- read.table( sessionfile, header=TRUE ); # Load the full data set and calculate some summary statistics Z8 <- getdata( 8, S ); Z8dd <- filterfeatures( Z8, 'DD', repmin=5 ); Z8h <- filterfeatures( Z8, 'Hold', repmin=5 ); seslen <- ddply( Z8, .(subject,sessionIndex,group,sessionName,isfree), function( Zi ) c(keys = sum( Zi$type == 'H' ), time = sum( Zi$time[ Zi$type == 'DD' ] )) ); ######################################################################## # 2. Run the evaluations ######################################################################## # Run a complete evaluation and tune the threshold A <- getdesign(); R <- adply( A, 1, runevaluation, S=S ); R <- transform( R, isimp = trainuser != testuser, transfirst = group %in% c('Group 1','Group 3'), seatranspic = group %in% c('Group 1','Group 4') ); # Tabulate the error rates for each classifier, group, and blockcount Q <- ddply( R, .(classifier,blockcount,group,isfree), tunethreshold, alpha=0 ); Q <- droplevels( subset( Q, trainuser != testuser ) ); Qcnt <- ddply( Q, .(classifier,blockcount,isfree,group, transfirst,seatranspic,trainuser,testuser), function(Qi) { c( errors = sum( Qi$error[ Qi$trainuser != Qi$testuser ] ), total = sum( Qi$trainuser != Qi$testuser ) ); } ); Qmean <- ddply( Qcnt, .(classifier,blockcount,isfree,group), function( Qi ) c( miss.rate = sum(Qi$errors)/sum(Qi$total) ) ); ######################################################################## # 3. Conduct the statistical tests ######################################################################## # Test for free/trans differences across keystroke timings, classifier # scores, and classifier errors Z8f <- filterfeatures( Z8, 'Hold+LogDD', repmin=5 ); time8tests <- ddplytests( Z8f, c('subject','feature','group'), 'isfree', 'time' ); errortests <- ddplytests( Qcnt, c('classifier','blockcount','group'), 'isfree', c('errors','total'), do.prop=TRUE ); ######################################################################## # 4. Calculate descriptive statistics and plot the results ######################################################################## # Set up the plotting theme theme <- col.whitebg(); groupcols <- c("#0080ff","#ff00ff","darkgreen","#ff0000","orange","#00ff00"); theme$superpose.symbol$col <- groupcols; theme$superpose.line$col <- groupcols; # Tabulate the average session lengths slen <- ddply( seslen, .(isfree), function(Zi) mean( Zi[['time']] ) ); freeavg <- slen$V1[ slen$isfree == TRUE ]; transavg <- slen$V1[ slen$isfree == FALSE ]; # Extract the pvals for each of the tests timepvals <- time8tests[['p.value']]; errorpvals <- errortests[['p.value']]; cat("Significant timing p-values: ",100*mean(timepvals<.05),"%\n"); cat("Significant evaluation p-values: ",100*mean(errorpvals<.05),"%\n"); # Create density plots for the times and scores Z8h <- transform( Z8h, isfree = ifelse( isfree, 'Free', 'Transcribed' ) ); Z8dd <- transform( Z8dd, isfree = ifelse( isfree, 'Free', 'Transcribed' ) ); Z82 <- transform( rbind( Z8h, Z8dd ), type = ifelse( type == 'DD', 'Keydown-Keydown', 'Hold' ) ); hcnt <- nlevels(Z8h[['feature']]); ddcnt <- nlevels(Z8dd[['feature']]); allfcnt <- hcnt+ddcnt; hfree <- round( 1000 * with( Z82, median( time[ type == 'Hold' & isfree == 'Free' ] ) ) ); htrans <- round( 1000 * with( Z82, median( time[ type == 'Hold' & isfree != 'Free' ] ) ) ); hdiff <- htrans - hfree; ddfree <- round( 1000 * with( Z82, median( time[ type == 'Keydown-Keydown' & isfree == 'Free' ] ) ) ); ddtrans <- round( 1000 * with( Z82, median( time[ type == 'Keydown-Keydown' & isfree != 'Free' ] ) ) ); dddiff <- ddtrans - ddfree; R <- transform( R, isfree = ifelse( isfree, 'Free', 'Transcribed' ), isimp = factor( ifelse( !isimp, 'User', 'Impostor' ), levels=c('User','Impostor') ) ); Rstat <- droplevels( subset( R, classifier == 'Statistical' ) ); Rdord <- droplevels( subset( R, classifier == 'DisorderBased' ) ); R <- transform( R, classifier = factor( classifier, labels=c('Statistical Classifier', 'Disorder-Based Classifier') ) ); trellis.device( postscript, file = 'figs/time-density.eps', paper='special', height = 3.5, width = 8, horiz=FALSE, theme=theme ); print( densityplot( ~ time | type, Z82, group = isfree, type = NA, n = 1001, xlab = 'Times (sec)', xlim = c(0,0.5), scale = list( alternating = FALSE ), auto.key = list( space = 'right' ) ) ); graphics.off(); # Create the error plots Qmean <- transform( Qmean, isfree = ifelse( isfree, 'Free', 'Transcribed' ), classifier = factor( classifier, labels=c('Statistical Classifier', 'Disorder-Based Classifier') ) ); Qmean <- ddply( Qmean, .(blockcount,isfree,classifier), summarize, miss.rate = mean( miss.rate ) ); trellis.device( postscript, file = 'figs/error-plot.eps', paper='special', height = 3, width = 8, horiz=FALSE, theme=theme ); print( xyplot( 100 * miss.rate ~ blockcount | classifier, Qmean, group = isfree, auto.key = list( space = 'right' ), scale = list( alternating = FALSE ), between = list( x=1, y=1 ), xlab = 'Number of Exercises (Sample Size)', ylab = 'Miss Rate (%)', type = 'b' ) ); graphics.off(); }