# Common functions for Eclipse Evaluation
# 
# Author: spd
###############################################################################

library(cacheSweave)
library(Hmisc)
library(xtable)
library(lattice)

source('../../../Library/R/tables.R')
source('../../../Library/R/plot.R')

plot_versions_since_origin<-function(ylab,bty,labels){
	r_plot(
		eclipse, 
		150, 
		restrict_age, 
		apply_all(c(
						filter_measure(filter_approach('ta'), measure_f),
						filter_measure(filter_approach('da'), measure_f),
						measure_prop(nrow(fixed)))),
		indexed=zip(col=colours, bg=colours, pch=pch, cex=1.1), 
		common=list(
				xlim=c(0,150),
				ylim=c(0,1),
				xlab='Maximum Number Of Versions Since Introduction',
				ylab=ylab,
				type='l',
				pgap=15,
				bty=bty))
	legend(
		'bottom', 
		c(labels, 'Overall Proportion of Commits'), 
		col=colours,
		pt.bg=colours,
		pt.cex=1.1,
		lty=1, 
		pch=pch,
		inset=0.05,
		bty=bty)

}

plot_max_commits_per_bug<-function(ylab,bty,labels){
	r_plot(
			eclipse, 
			25, 
			restrict_size, 
			apply_all(c(
							filter_measure(filter_approach('ta'), measure_f),
							filter_measure(filter_approach('da'), measure_f),
							measure_prop(nrow(eclipse)))),
			indexed=zip(col=colours, bg=colours, pch=pch, cex=1.1), 
			common=list(
					xlim=c(0,25),
					ylim=c(0,1),
					xlab='Maximum Commits Per Bug',
					ylab=ylab,
					type='l',
					pgap=3,
					bty=bty))
	legend(
			'bottom', 
			c(labels, 'Overall Proportion of Commits'), 
			col=colours,
			pt.bg=colours,
			pt.cex=1.1, 
			lty=1, 
			pch=pch,
			inset=0.05,
			bty=bty)
	
}

bugs_tagged <- function(bugs, ..., invert=FALSE){
	tags<-list(...)
	index<-numeric()
	for (tag in tags) {
		i<-grep(tag, bugs$tags)
		index<-c(index,i)
	}
	
	if(invert) index<-0-index
	
	bugs[index,]
}

evaluate_approaches<-function(bugs, approaches, measures, n=names(approaches)){
	evaluation<-data.frame(row.names=n)
	for (i in seq(along = measures)) {
		f<-measures[[i]]
		name<-names(measures)[i]
		
		evaluation[[name]]<-sapply(approaches, function(x){f(filter_approach(x)(bugs))})
	}
	
	evaluation
}

restrict_size<-function(bugs, x){bugs$bug_size<=x}
restrict_age<-function(bugs, x){!is.na(bugs$age)&bugs$age<=x}
restrict_born<-function(bugs, x){!is.na(bugs$born)&bugs$born<=x}

filter_approach<-function(approach){
	function(bugs){
		n<-function(x){paste(approach,x,sep='_')}
		data.frame(tp=bugs[[n('tp_l')]], fp=bugs[[n('fp_l')]], tn=bugs[[n('tn_l')]], fn=bugs[[n('fn_l')]])
	}
}

measure_values<-function(x){c(sum(x$tp), sum(x$fp), sum(x$fn))}
measure_p<-function(x){sum(x$tp)/(sum(x$tp)+sum(x$fp))}
measure_r<-function(x){sum(x$tp)/(sum(x$tp)+sum(x$fn))}
measure_a<-function(x){(sum(x$tp)+sum(x$tn))/(sum(x$tp)+sum(x$fp)+sum(x$tn)+sum(x$fn))}
measure_f<-function(x){p<-measure_p(x);r<-measure_r(x);2*p*r/(p+r)}
measure_prop<-function(total){function(x){nrow(x)/total}}
measure_tp<-function(x){sum(x$tp)}
measure_fp<-function(x){sum(x$fp)}
measure_fn<-function(x){sum(x$fn)}
filter_measure<-function(filter, measure){function(x){measure(filter(x))}}
apply_all<-function(measures){
	function(x){
		ret<-list()
		for(i in seq(along=measures)){
			ret[[i]]<-measures[[i]](x)
		}
		ret
	}
}

r_plot<-function(bugs,max,restrict,measure,indexed=NA,common=NA){
	v<-sapply(c(1:max), function(x){
		b<-bugs
		if(!is.null(restrict)){
			b<-b[restrict(b, x),]
		}
		measure(b)
	})
	
	args<-common
	args$x<-0
	args$type<-'n'
	do.call(plot, args[names(args)!='pgap'])
	
	if(is.null(nrow(v))){
		args<-c(indexed[[1]],common)
		args$x<-v
		do.call(lines, args[names(args)!='pgap'])
	}
	else{
		for(i in seq(from=1,to=nrow(v))){
			args<-c(indexed[[i]],common)
			args$x<-unlist(v[i,])
			do.call(lines, args[names(args)!='pgap'])
			
			if(!is.null(args$pgap)){
				every<-c(seq(to=length(args$x), by=args$pgap), length(args$x))
				args$y<-args$x[every]
				args$x<-every
				args$type='p'
				do.call(points, args[names(args)!='pgap'])
			}
		}
	}
}

fallback<-function(bugs, name, first, ...){
	bugs[[name]]<-bugs[[first]]
	for(other in list(...)){
		empty<-sapply(bugs[[name]], length)==0
		bugs[empty,][[name]]<-bugs[empty,][[other]]
	}
	return(bugs)
}

evaluate_fallback_algorithms<-function(bugs, measures){
	
	bugs<-fallback(bugs, 'ta_da', 'ta', 'da')
	bugs<-fallback(bugs, 'da_ta', 'da', 'ta')
	bugs$match<-mapply(intersect, bugs$ta, bugs$da)
	
	bugs<-get_prfa(bugs, 'ta_da')
	bugs<-get_prfa(bugs, 'da_ta')
	bugs<-get_prfa(bugs, 'match')
	
	t<-evaluate_change(bugs, 'ta', 'ta_da', measures)
	d<-evaluate_change(bugs, 'da', 'da_ta', measures)
	ret<-rbind(t, d)
	rownames(ret)<-c("TA", "TA,DA", "DA", "DA,TA")
	
	ret
}

evaluate_ta_single_result<-function(bugs, measures){
	first<-function(x){
		if(length(x)==0){
			return(character(0))
		}
		
		x[1]
	}
	
	# This isn't actually true, but for the non-Superset responses (which are already set)
	# we don't really care which result gets returned, only how many & if it's right or wrong
	unset<-is.na(bugs$frequent)
	bugs[unset,]$frequent<-sapply(bugs[unset,'ta'], first)
	bugs<-get_prfa(bugs, 'frequent')
	
	unset<-is.na(bugs$recent)
	bugs[unset,]$recent<-sapply(bugs[unset,'ta'], first)
	bugs<-get_prfa(bugs, 'recent')
	
	ret<-evaluate_change(bugs, 'ta', c('frequent', 'recent'), measures)
	rownames(ret)<-c("TA", "Majority of Lines", "Most Recent")

	ret
}

evaluate_change<-function(bugs, base, changes, measures){
	n<-function(...){
		paste(..., sep='_')
	}
	
	approaches<-c(base, changes)
	names(approaches)<-approaches
	evaluate_approaches(bugs, approaches, measures)
}

get_results_by_class<-function(bugs, approach){
	data<-split(bugs, bugs$origin_class, drop=TRUE)
	f<-function(measure){
		col<-paste(approach, measure, 'l', sep='_')
		s<-function(x){
			sum(x[[col]])
		}
		r<-sapply(data, s)
	}
	
	ret<-sapply(c('tp', 'fp', 'fn'), f)
	colnames(ret)<-c('TP', 'FP', 'FN')
	
	ret
}

split_strings<-function(strings){
	sapply(as.character(strings), strsplit, split='|', fixed=TRUE)
}

get_class<-function(origin,origin_class,result){
	if(length(result) == 0 || (length(result) == 1 && (is.na(result) || result == ''))){
		return('Nothing')
	}
	
	if(length(origin) == 1 && origin_class == 'Unclear'){
		return('Unclear')
	}
	
	if(length(origin) == 1 && origin_class %in% c('Elsewhere', 'Unrelated', 'Related')){
		return('Incorrect')
	}
	
	x<-intersect(origin, result)
	
	if(length(x) == 0){
		return('Incorrect')
	}
	
	o<-setdiff(origin,x)
	r<-setdiff(result,x)
	
	if(length(o) == 0 && length(r) == 0){
		return('Correct')
	}
	
	if(length(o) == 0){
		return('Superset')
	}
	
	if(length(r) == 0){
		return('Subset')
	}
	
	return('Mixed')
}

get_prfa<-function(bugs, measure){
	tp<-mapply(intersect, bugs$origin, bugs[[measure]])
	fn<-mapply(setdiff, bugs$origin, tp)
	fp<-mapply(setdiff, bugs[[measure]], tp)
	
	tp_l<-sapply(tp, length)
	fn_l<-sapply(fn, length)
	fp_l<-sapply(fp, length)
	tn_l<-sapply(bugs$version, get_age) - 1 - tp_l - fn_l - fp_l
	
	precision<-tp_l/(tp_l+fp_l)
	recall<-tp_l/(tp_l+fn_l)
	fvalue<-2*precision*recall/(precision+recall)
	accuracy<-(tp_l+tn_l)/(tp_l+fp_l+tn_l+fn_l)
	
	df<-data.frame(I(tp), I(fn), I(fp), tp_l, fn_l, fp_l, tn_l, precision, recall, fvalue, accuracy)
	names(df)<-sapply(names(df), function(x){paste(measure,x,sep='_')})
	bugs<-cbind(bugs, df)
	return(bugs)
}

split_version<-function(x){
	if(length(x)==0||is.na(x)||x==''){
		return(NA)
	}
	
	versions<-split_strings(x)[[1]]
	earliest<-c(1,Inf)
	for(i in seq(along=versions)){
		v<-as.numeric(strsplit(versions[i], ".", fixed=TRUE)[[1]])
		if(length(v) == 1){
			return(NA)
		}
		
		p<-2
		while(p<=length(v) && p<=length(earliest)){
			if(v[p]<earliest[p]){
				earliest<-v
				break
			}
			
			if(v[p]==earliest[p] && length(earliest) > length(v)){
				earliest<-v
				break
			}
			p<-p+2
		}
	}
	return(earliest)
}

get_age<-function(fix, origin='0.0'){
	o<-split_version(origin)
	f<-split_version(fix)
	
	if(is.na(o) || is.na(f)){
		return(NA)
	}
	
	age<-0
	while(length(f)>length(o)){
		age<-age+f[length(f)]
		f<-f[1:(length(f)-1)]
	}
	
	while(length(o)>2){
		age<-age+(f[length(f)]-o[length(o)])
		o<-o[1:(length(o)-2)]
		f<-f[1:(length(f)-2)]
	}
	
	age<-age+(f[length(f)]-o[length(o)])
	return(age)
}

blank<-function(x){
	if(sum(is.na(x))>0){
		return(character(0))
	}
	
	x
}

get_data<-function(file){
	data<-scan(file=file, what=list("file", "version", "bug", "type", "origin", "ta", "da", "origin_class", "frequent", "recent", "tags"), sep=",", na.strings="?", skip=1)
	bugs<-as.data.frame(data, stringsAsFactors=FALSE)
	names(bugs)<-c("file", "version", "bug", "type", "origin", "ta", "da", "origin_class", "frequent", "recent", "tags")
	bugs<-bugs[bugs$type=='Bug',]
	
	bugs$origin<-split_strings(bugs$origin)
	bugs$ta<-split_strings(bugs$ta)
	bugs$da<-split_strings(bugs$da)
	bugs$tags<-split_strings(bugs$tags)
	
	bugs$bug<-factor(bugs$bug)
	bugs$origin_class<-factor(bugs$origin_class, c("Single", "Multiple", "Related", "Elsewhere", "Unrelated", "Unclear"), ordered=TRUE)
	
	bugs$origin<-c(sapply(bugs$origin, blank))
	bugs$ta<-c(sapply(bugs$ta, blank))
	bugs$da<-c(sapply(bugs$da, blank))
	
	bugs$born<-sapply(bugs$origin, get_age)
	bugs$age<-mapply(get_age, bugs$version, bugs$origin)
	
	bugs<-get_prfa(bugs, 'ta')
	bugs<-get_prfa(bugs, 'da')
	bugs$bug_size<-table(bugs$bug)[bugs$bug]
	
	return(bugs)
}
