#Script to perform interval matching of short read sequence intervals and predicted gene models (Glyma1.01 genome assembly).

clustercount<-function(j,filename){
	#sapply allows for intervals to be used in the function
	sapply(j,function(i){
	
		temp1<-solexa[,4:5]
		tempsoymap<-soymap[i,4:5]

		temp3<-temp1[which((temp1[,1]<=tempsoymap[[1]] & temp1[,2]>=tempsoymap[[2]]) | (temp1[,1]<=tempsoymap[[2]] & temp1[,2]>=tempsoymap[[2]]) | (temp1[,1]>=tempsoymap[[1]] & temp1[,1]<=tempsoymap[[2]]) | (temp1[,2]>=tempsoymap[[1]] & temp1[,2]<=tempsoymap[[2]])),]

			write.table(cbind(soymap[i,],dim(temp3)[1]), file=filename, append=T,quote=F,col.names = F)

	})
	
}

library(intervals)
#read in data
soyin<-read.table('~/solexa/glymacoords/Glyma1.col5')
solexain<-read.table('./solexa.col5')

names<-as.matrix(read.table('./names.txt'))

#set counters
countNomap=0

#For each chromosome/scaffold set up each soy and solexa variable 
#and then proceed processing to the same file each time
#for (n in 21:21){
for (n in 1:length(names)){
		print(n)
		print(names[n])
		
		#take only those sequences and gene models that correspond with the chromosome/scaffold
		solexa<-solexain[which(solexain[,3]==names[n]),]
		soy<-soyin[which(soyin[,3]==names[n]),]

		#check to see if there are any solexa and soy sequences for this scaffold
		#this will not be an issue if you create the names file from the solexa data each time
		#however for testing purposes on small sets I put it here
		
		soyinlength=length(which(soyin[,3]==names[n]))
		solexainlength=length(which(solexain[,3]==names[n]))
		if (solexainlength==0 & soyinlength==0){
			next
		}

			
		#check to see of there are any solexa sequences for this chromosome/scaffold
		#if there aren't write out the gene models with 0 counts to the outfile and go to
		#the next chromosome/scaffold
		if (solexainlength==0){
			soy2<-as.data.frame(soy)
			soy2$V6=0 #map count -- how many solexa reads map to this model
			soy2$V7=soy$V4
			soy2$V8=soy$V5
			write.table(soy2, file="GlymaIDCounts.txt", append=T,quote=F,col.names = F)
			next
		}
		
		
		#check to see if there are solexa sequences and see if there are
		#gene models for this chromosome/scaffold.  If no genes then write out the
		#solexa sequences to a no map to genemodel output file
		
		if (soyinlength==0 & solexainlength!=0){
			print(paste("no gene models for",names[n]))	
			countNomap=countNomap+dim(solexa)[1]
			write.table(solexa, file="GSNAPIDnoMap.txt", append=T,quote=F,col.names = F)
			next
		}
	
		#Now that we know we have both solexa reads and gene models for this chromosome/scaffold
		#convert the data to intervals for comparison
		soyI<-Intervals(soy[,4:5])
		solexaI<-Intervals(solexa[,4:5])
		
		
		#find those in the solexa data that don't map to gene models and write them out to a separte file
		d2nsolexa<-distance_to_nearest(solexaI,soyI)
		if (length(which(d2nsolexa>0))>0){
			solexa[which(d2nsolexa>0),]->solexanomap
			countNomap=countNomap+dim(solexanomap)[1]
			write.table(solexanomap, file="GSNAPIDnoMap.txt", append=T,quote=F,col.names = F)
		}
		
		
		#find those in the gene models that are mapped to the solexa data
		d2nsoy<-distance_to_nearest(soyI,solexaI)
		soy[which(d2nsoy>0),]->soynomap  #those gene models that do not map
		soy[which(d2nsoy==0),]->soymap   #those gene models that do map to the solexa data
		
		#count how many of the solexa reads map to the gene model 
		#if there is > 0 that map then run clustercount defined above that accounts 
		#for sequences that overlap partially with the gene model by expanding the gene model
		soyany<-length(which(d2nsoy==0))  #how many map to the solexa data
		if (soyany != 0){
			clustercount(1:dim(soymap)[1],"GlymaIDCounts.txt")
		}

		#output the nomap glymaID and Glymacoords in the same format with a 0 for the map count
		if(length(which(d2nsoy>0))!=0){
			soynomap2<-as.data.frame(soynomap)
			soynomap2$V6=0 #map count -- how many solexa reads map to this model
			soynomap2$V7=soynomap$V4
			soynomap2$V8=soynomap$V5
			write.table(soynomap2, file="GlymaIDCounts.txt", append=T,quote=F,col.names = F)
		}
print(paste("countNomap",countNomap))
}

