#!/usr/bin/perl
#############################################################################
#
#
#############################################################################
#############################################################################
#############################################################################
# Written for CIDR by Peter Askovich, 2015 ##################################
# Program takes list of genes as input, gets sequences from #################
# promoter2014 database and calculates clover scores for selected ###########
# promoters (based on Jaspar DB) #########################################
# peter.askovich.cidresearch.org ##########################################
#############################################################################
#############################################################################
###############################################################################
#VARIABLE AND SUBROUTINE DECLARATIONS------------------------------------------
###############################################################################
use strict;
#$|++;
require Exporter;
use DBI;
use CGI qw(:standard);
use lib "modules";
use LWP;
# custom modules ###########
use errorHandling qw(
printError
$debugInfo
);
############################
#VARIABLE DECLARATIONS---------------------------------------------------------
my $testing = 0; # if true, extra information is printed
my %formValues;
my $debugInfo;
my $searchFormFile = 'files/searchForm.html';
my $resultTemplateFile = 'files/resultTemplate.html';
my $dbh;
my $tmpFastaFile; #filename for the tmpSequenceFile (to be used for clover search)
my $tmpFilteredMatrixFile; #filename for the tmpMatrixFile (to be used for clover search)
my $tmpCloverResultFile; #filename for the output file from the clover program
#my $transfacMatrixFile = "files/TRANSFAC2010_forCloverWithNumbers.txt";
#my $transfacMatrixInfoFile = "files/matrix.dat";
my $jasparMatrixFile = "files/jasparPfmVertebrates.forClover.2014.12.10.txt";
#my $useDB; # 1 if we are using TRANSFAC; 2 if we are using JASPAR
#my @transfacMotifList; #list of transfac motifs for filtering passed by the form to the program
my @motifList; #list of jaspar motifs for filtering passed by the form to the program
my $topMessage; #collects information throughout the program on important params and displays it on top of the results page
my $totalSequencesUsed; #total number of sequeces retrieved from the DB for searching
my %motifPubmedLink;
my %fullMotifInfo;
#my %fullMotifInfoJ;
my %motifTF;
#SUBROUTINE DECLARATIONS-------------------------------------------------------
sub connectDB ($);
sub disconnectDB ();
sub openHTMLTemplate (@);
sub closeHTMLTemplate ();
sub showMainSearchPage ();
sub getSequences ();
sub filterMotifs ();
sub runClover ();
sub loadMotifInfo ();
sub parseAndPrintCloverResults ();
sub commify ($);
###############################################################################
###############################################################################
###############################################################################
#PROGRAM STARTS HERE-----------------------------------------------------------
###############################################################################
#print "Content-type:text/html\n\n";
### Read the POST buffer then and populate $formValues
my $buffer;
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'});
my @pairs = split(/&/, $buffer);
foreach my $pair (@pairs) {
my ($name, $value) = split(/=/, $pair);
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$formValues{$name} = $value;
$debugInfo .= "NAME:$name VALUE:$value<br>\n";
push @motifList, $value if $name eq "motif";
}
## save the ENV if in debug
if ($testing) {
foreach my $key (keys %ENV) {
$debugInfo .= "Key:\.$key\. Value:\.$ENV{$key}\.<br>\n";
}
}
################################################################################
################################################################################
#no matter what, we need this
print "Content-type:text/html\n\n";
my $action = $formValues{'action'};
### MAIN BRANCH POINT @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
### since this is if - elsif only one of the following can be executed at the time
#if ($action eq 'searchPeptides') { # if user posts login form
# searchPeptides();
#}
if ($action eq 'getMotifPrediction') {
#get sequences first, write time to tmpFile
getSequences();
#parse motif file down to motifs from the query, write it to tmpFile
filterMotifs();
#run clover search using parameters fromt he form, write output to a tempFile
runClover();
#TODOTODO
#parse clover results and show them on the screen
loadMotifInfo();
#TODOTODO
#parse clover results and show them on the screen
parseAndPrintCloverResults();
}
else { # if no action was specified, show main page
showMainSearchPage();
}
### END MAIN BRANCH POINT @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
################################################################################
################################################################################
print "Debug Info: $debugInfo\n" if $testing;
print "Have a very nice day :-)\n" if $testing;
###############################################################################
#PROGRAM ENDS HERE-------------------------------------------------------------
###############################################################################
###############################################################################
###############################################################################
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
###SUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBSSUBS
### showMainSearchPage() ######################################################
### Prints main search page ###################################################
### First it gets information from the database for few fields and ############
### creates a form containing this info so that users can choose ##############
### some basic search parameters ##############################################
sub showMainSearchPage() {
my $jasparForm = " <select name=\"motif\" size=\"12\" id=\"42\" multiple=\"multiple\" style=\"width: 170px;\">\n";
#get JASPAR motif lists
open (JASPARMATRIXFILE, "$jasparMatrixFile")
or printError ("Error opening search form file($jasparMatrixFile)", $debugInfo, $testing);
while(my $tmpJLine = <JASPARMATRIXFILE>) {
$tmpJLine =~ s/$1//g if m/(\r\n?|\n\r?)/; #instead of chomp
if($tmpJLine =~ m/^>(.*)/) {
$jasparForm .= " <option value=\"$1\">$1</option>\n"
};
}
close JASPARMATRIXFILE;
$jasparForm .= " </select><br>\n";
# open the saarchForm.html file
open (SEARCHFORMFILE, "$searchFormFile")
or printError ("Error opening search form file($searchFormFile)", $debugInfo, $testing);
while(<SEARCHFORMFILE>) {
my $tmpLine = $_;
$tmpLine =~ s/$1//g if m/(\r\n?|\n\r?)/; #instead of chomp
#print "<pre>TMPLINE:$tmpLine</pre><br>\n";
if($tmpLine eq "<!--INSERT JASPAR LIST//-->") {
print "$jasparForm\n";
}
else {
print "$tmpLine\n";
}
}
close SEARCHFORMFILE
or printError ("Error closing search form file ($searchFormFile)", $debugInfo, $testing);
}
### END of displayMainSearchPage() ############################################
### folterMotifs() ###########################################################
### Takes the list of motifs (Transfac or Jaspar) from the formValues #########
### uses it to filter motifs from the file containing all motifs ##############
### and saves filtered list into a tmpFile for clover run #####################
sub filterMotifs() {
#my $useDB; # 1 if we are using TRANSFAC; 2 if we are using JASPAR #defined above
my $matrixFile; #transfac or jaspar
#my @filterList;
#my $matrixFile = 'TRANSFAC2010_Matrices_forCloverWithNumbers.txt'; #transfac or $jasparMatrixFile
#my $listFile = 'motifList.txt'; #transfac or jaspar motif list from formValues
#my $outFile = 'downIFNAR.1m.filtered.matrix.txt'; #tmpFile
#@filterList = @motifList;
$matrixFile = $jasparMatrixFile;
#create tmpFilteredMatrixFile name here
#my $tmpFilteredMatrixFile; #declared above
my $done = 0;
my $n = 1;
while (!$done) {
$tmpFilteredMatrixFile = "tmp/tmpFilteredMatrixFile" . "$n" . ".matrix";
if (-e $tmpFilteredMatrixFile) {$n++}
else {$done++}
}
#open matrixFile for reading and tmpFilteredMatrixFile for writing
#filter matrixFile, write output to the tmpFilteredMatrixFile
open (MATRIXFILE, $matrixFile)
or printError ("error opening matrix file($matrixFile)", $debugInfo, $testing);
open (TMPFILTEREDMATRIXFILE, ">$tmpFilteredMatrixFile")
or die ("could not open tmpFilteredMatrixFile for writing");
my @data; #2D array holding binding site info
my $e = 0; #var that counts how many TFs we worked on
my $f = 0; #counts how many TFs matched
my $currentTF;
while (<MATRIXFILE>) {
#copy line to $line, remove new line
my $line = $_;
$line =~ s/$1//g if m/(\r\n?|\n\r?)/; #instead of chomp
#if the line starts with the ">" check if the previous TF matches the list and print it out if it does
if ($line =~ m/^>/) {
# trim ">" for matching
$line =~ s/^>//;
if ($currentTF) { #this will skip first line
foreach my $motifListItem (@motifList) { #go throug each of the motifs submitted by the web page
if ($currentTF eq $motifListItem) { #if it matches current TF, write it up to out file
print TMPFILTEREDMATRIXFILE ">$currentTF\n";
foreach my $aref ( @data ) {print TMPFILTEREDMATRIXFILE "@$aref\n";}
$f ++;
last;
}
}
}
undef @data;
$currentTF = $line;
$currentTF =~ s/^>V\$//;
$e++;
}
else {
# split line into array, push it into 2D array
my @tmp = split / /, $line;
push @data, [ @tmp ];
}
#print "." if $e % 50 == 0;
#last if $e> 10;
}
print "\nProcessed $e Transcription Factors $f saved to $tmpFilteredMatrixFile<br>\n" if $testing;
$topMessage .= "<p>Total of $e motifs found in database, using $f selected ones</p>\n";
close MATRIXFILE;
close TMPFILTEREDMATRIXFILE;
}
### END filterMotifs () #######################################################
### getSequences() ############################################################
### Fethces sequences from DB, then saves them to the tmpFile #################
### It will use ID type, species and list of IDs to get seqs ############
### ###########################################################################
sub getSequences() {
# rename form hash variables for easier handling
my $mappings = $formValues{'mappings'};
my $longest = $formValues{'longest'};
my $species = $formValues{'species'};
my $before = $formValues{'before'};
my $after = $formValues{'after'};
my $mask = $formValues{'mask'};
# process ids from the form in the format we can use ##############################
my @ids = split /\r\n/, $formValues{'ids'}; # array list of IDs to look up
my $numIDs = @ids; # number of IDs passed to the function
my $ids; # formatted version of IDs
foreach my $id (@ids) {
$id =~ s/'/\\\'/g; #instead of chomp
$id =~ s/"/\\\"/g; #instead of chomp
$id =~ s/;/\\\;/g; #instead of chomp
$ids .= "\'$id\', ";
}
$ids =~ s/\W*$/\'/;
###################################################################################
my @results; # 2D array holding sequences and annotation for requested genes
# assemble print staments to let users know what we are doing ##############################################################
my $mappingStatus = "$mappings mappings";
$mappingStatus .= " (longest sequence only)" if $longest;
my $maskedStatus;
if ($mask == 0) {$maskedStatus = 'Nothing was masked'}
elsif ($mask == 1) {$maskedStatus = 'Repeats were masked'}
else {$maskedStatus = 'unknown!'}
#############################################################################################################################
# put the query together now ###############################################################################
my $longestOnly = 'AND longest=1' if $longest;
my $getSeqSQL = "
SELECT geneName, geneID, tss, longest, chromosome, start, stop, strand, species, genomeVersion, sequence
FROM main
WHERE geneName IN ($ids) AND species='$species' AND mappings='$mappings' $longestOnly
ORDER BY geneName";
############################################################################################################
# if we are in the testing mode, just pring query and return ##############
if ($testing) {
print "<br><pre>$getSeqSQL</pre>";
#return;
}
###########################################################################
# now get sequences and annotation for the requested genes from the DB ############################################################
connectDB('promoter2014');
my ($t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7, $t8, $t9, $t10);
my $sth = $dbh->prepare ("$getSeqSQL")
or printError("greb", "$debugInfo<br>\npreparing statement $DBI::err ($DBI::errstr)", $testing);
$sth->execute()
or printError("ff443s", "$debugInfo<br\nexecuting $DBI::err ($DBI::errstr)", $testing);
# geneName, geneID, tss, longest, chromosome, start, stop, strand, species, genomeVersion, sequence
$sth->bind_col (1, \$t0); # geneName
$sth->bind_col (2, \$t1); # geneID
$sth->bind_col (3, \$t2); # tss
$sth->bind_col (4, \$t3); # longest
$sth->bind_col (5, \$t4); # chromosome
$sth->bind_col (6, \$t5); # start
$sth->bind_col (7, \$t6); # stop
$sth->bind_col (8, \$t7); # strand
$sth->bind_col (9, \$t8); # species
$sth->bind_col (10, \$t9); # genomeVersion
$sth->bind_col (11, \$t10); # sequence
my $k = 0;
while($sth->fetch) {
$results[$k][0] = $t0; # geneName
$results[$k][1] = $t1; # geneID
$results[$k][2] = $t2; # tss
$results[$k][3] = $t3; # longest
$results[$k][4] = $t4; # chromosome
$results[$k][5] = $t5; # start
$results[$k][6] = $t6; # stop
$results[$k][7] = $t7; # strand
$results[$k][8] = $t8; # species
$results[$k][9] = $t9; # genomeVersion
$results[$k][10] = $t10; # sequence
$k++;
}
$sth->finish ()
or printError("ZZod003", "$debugInfo<br\nfinishing $DBI::err ($DBI::errstr)", $testing);
disconnectDB();
print "Successfully completed SQL<br>\n" if ($testing);
$topMessage .= "<p>Total of $numIDs genes were submitted for analysis, $k sequences retrieved from the DB</p>\n";
$topMessage .= "<p>Promoter sequences were derived using $mappingStatus <br>($species - $results[0][9])</p>";
$topMessage .= "<p>Promoter sequence length is $before before and $after after TSS ($maskedStatus)</p>\n";
#save number of genes that were used in search for the statistcs later
$totalSequencesUsed = $k;
###################################################################################################################################
# trim sequences to desired length #########################################################
# mask repeats if needed and format the sequence to 60 bp per line #########################
# we are starting from 2000/2000 except in rare case when the before section is shorter ####
$k = 0;
while ($results[$k][10]) {
my $seqLength = length $results[$k][10];
if ($seqLength < 4000) {
}
my $offset = 2000 - $before;
my $length = $before + $after;
my $trimmed = substr $results[$k][10], $offset, $length;
# mask repeats if that was requested
if ($mask == 1) {
$trimmed =~ tr/[atgc]/N/;
}
$trimmed =~ s/(.{60})/$1\n/g;
$trimmed =~ s/\n\n/\n/; #in case number of BP is divisible by 60, end can have two \n ... maybe
# save it back to the 2D array results
$results[$k][10] = $trimmed;
$k++;
}
############################################################################################
# finaly print the results to a tmpFile in tmp dir ####################################################################################
#######################################################################################################################################
#my $tmpFastaFile; Declared above, all the way at the top
my $done = 0;
my $n = 1;
while (!$done) {
$tmpFastaFile = "tmp/tmpSequences" . "$n" . ".fasta";
if (-e $tmpFastaFile) {$n++}
else {$done++}
}
open TMPFASTAFILE, ">$tmpFastaFile"
or printError('123', "could not open TMPFASTAFILE $tmpFastaFile<br>\n", '123');
$k = 0;
while ($results[$k][0]) {
print TMPFASTAFILE ">$results[$k][0]\|$results[$k][1]\|tss:$results[$k][2]\|chr:$results[$k][4]\|strand:$results[$k][7]\|longest:$results[$k][3]\|species:$results[$k][8]\|genomeVersion:$results[$k][9]\n";
print TMPFASTAFILE "$results[$k][10]\n";
$k++;
}
close TMPFASTAFILE;
}
### END getSequences() ########################################################
### runClover() ###############################################################
### runs clover program using two files prepared above ########################
### tmpSequenceFile and tmpMatrixFile #########################################
### ###########################################################################
sub runClover() {
my $cloverCommandLine;
#make tmpCloverResultFile name here
#my $tmpCloverResultFile; #declared above
my $done = 0;
my $n = 1;
while (1) {
$tmpCloverResultFile = "tmp/tmpCloverResultFile" . "$n" . ".result";
last unless -e $tmpCloverResultFile;
$n++;
}
$cloverCommandLine = "bin/clover -u $formValues{'minCloverScore'} $tmpFilteredMatrixFile $tmpFastaFile > $tmpCloverResultFile";
print "\n<br>Clover command line is $cloverCommandLine<br>\n" if $testing;
my $cloverResult = `$cloverCommandLine`;
#print "\n<br>Clover output was $cloverResult<br>\n" if $testing;
}
### END runClover() ###########################################################
### sub loadMotifInfo() #######################################################
### preloads the whole matrix.dat file into set of useful vars ################
### this is used to later display links/info etc. #############################
sub loadMotifInfo() {
### go through all selected motifs and retrieve information from the db
my $jm = 0;
while ($motifList[$jm]) {
print "MotifList[jm] $motifList[$jm]<br>\n " if $testing;
my $mNumber;
my $mVersion;
$motifList[$jm] =~ m/^([a-zA-Z0-9_]*)\.(\d)\s/;
$mNumber = $1;
$mVersion = $2;
print "ID=$mNumber version=$mVersion<br>\n" if $testing;
# put the query together now ###############################################################################
my $getJasparInfoSQL = "
SELECT a.ID, a.BASE_ID, a.VERSION, a.COLLECTION, a.NAME, b.ID, b.TAG, b.VAL
FROM MATRIX a, MATRIX_ANNOTATION b
WHERE a.BASE_ID = '$mNumber' and a.ID = b.ID and a.VERSION=$mVersion";
############################################################################################################
print "<br><pre>$getJasparInfoSQL</pre>" if $testing; # testing only
###########################################################################
# now get Jaspar info from the DB ############################################################
connectDB('Jaspar2014');
my ($t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7);
my $sth = $dbh->prepare ("$getJasparInfoSQL") or printError("greb", "$debugInfo<br>\npreparing statement $DBI::err ($DBI::errstr)", $testing);
$sth->execute() or printError("werwer3344", "$debugInfo<br\nexecuting $DBI::err ($DBI::errstr)", $testing);
$sth->bind_col (1, \$t0); # ID - MATRIX
$sth->bind_col (2, \$t1); # BASE_ID - MATRIX
$sth->bind_col (3, \$t2); # VERSION - MATRIX
$sth->bind_col (4, \$t3); # COLLECTION - MATRIX
$sth->bind_col (5, \$t4); # NAME (TF) - MATRIX
$sth->bind_col (6, \$t5); # ID - MATRIX_ANNOTATION
$sth->bind_col (7, \$t6); # TAG - MATRIX_ANNOTATION
$sth->bind_col (8, \$t7); # VAL - MATRIX_ANNOTATION
while($sth->fetch) {
my $fullMotifName = $t1 . "." . $t2;
#print "FullMotifName =\.$fullMotifName\. t1=\.$t1\. t2=$t2 t3=$t3 t4=$t4 t5=$t5 t6=$t6 t7=$t7 <br>\n";
$motifTF{$fullMotifName} = $t4;
$motifPubmedLink{$fullMotifName} = "http://www.ncbi.nlm.nih.gov/pubmed?term=$t7" if $t6 eq 'medline';
$fullMotifInfo{$fullMotifName} .= "$t6 - $t7\\n";
}
$sth->finish () or printError("ZZod003", "$debugInfo<br\nfinishing $DBI::err ($DBI::errstr)", $testing);
disconnectDB();
#print "Successfully completed SQL<br>\n" if ($testing);
$jm++;
}
#for my $key (keys %motifPubmedLink) {
# print "motif name = $key and link is $motifPubmedLink{$key}<br>\n";
#}
}
### END sub loadMotifInfo() ###################################################
### parseCloverResults() ######################################################
### takes the output of the temporary result file, formats it #################
### and displays it on the screen (with added links etc. ######################
### ###########################################################################
sub parseAndPrintCloverResults() {
my $geneName;
my $geneID;
my $tss;
my $chr;
my $strand;
my $longest;
my $species;
my $genomeVersion;
my $beforeTss = $formValues{'before'};
my $totalSequenceLength = $formValues{'before'} + $formValues{'after'};
my %totalMotifCount;
my %totalMotifScore;
my %totaGeneCount;
my %geneMotifHash;
my %genesWithMotif;
my $result; # This is where all the results go, to be printed at the end of the function
#print the file header first
#print "<!DOCTYPE html PUBLIC \"XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
#print "<html lang=\"en\">\n";
#print "<head>\n";
#print " <title>Motif Search on Penguin</title>\n";
#print " <link type=\"text/css\" href=\"jss/style.css\" rel=\"stylesheet\" />\n";
#print "</head>\n";
#print "<body>\n";
#print topMessage first TOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGE
$result = "\n";
$result .= $topMessage;
$result .= "<p> </p>\n";
#end of topMessage TOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGETOPMESSAGE
#this will contain the complete code for the full result table, sorted by Motif
my $resultTableByMotif = "<table class=\"detailT\" title=\"clover results\">\n";
open (TMPCLOVERRESULTFILE, "$tmpCloverResultFile")
or printError ("Error opening tmpCloverResultFile ($tmpCloverResultFile)", $debugInfo, $testing);
#print "<pre>\n";
#print "<table align=\"center\" border=\"3\" title=\"clover results\">\n";
my $gray = 1; #every second line in the table should be gray.
my $s = 0; #tracks 1st dimension of the fullMotifGeneSummary 2D array
my @fullMotifGeneSummary; #2D array that holds complete information on all motifs/genes from clover results
while(<TMPCLOVERRESULTFILE>) {
my $line = $_;
$line =~ s/$1//g if m/(\r\n?|\n\r?)/; #instead of chomp
# This is why people hate Perl, but this is also why Perl is powerful. This RegEx will match and parse the whole ID line!!!
if ($line =~ m/^>(.*)\|(.*)\|tss:(.*)\|chr:chr(.*)\|strand:(.*)\|longest:(.*)\|species:(.*)\|genomeVersion:(.*)/) {
$geneName = $1;
$geneID = $2;
$tss = $3;
$chr = $4;
$strand = $5;
$longest = $6;
$species = $7;
$genomeVersion = $8;
$resultTableByMotif .= "<tr><td colspan=\"7\">\n";
$resultTableByMotif .= "<h2><hr><br>$geneName</h2><h5>($geneID) Chromosome:$chr Strand:$strand TSS:" . commify($tss) . "</h5>\n";
$resultTableByMotif .= "</td></tr>\n";
$resultTableByMotif .= "<tr>\n";
$resultTableByMotif .= "<th><strong>Motif</strong></th>\n";
$resultTableByMotif .= "<th><strong>Start</strong></th>\n";
$resultTableByMotif .= "<th><strong>Stop</strong></th>\n";
#$resultTableByMotif .= "<td><b>Strand</b></td>\n";
$resultTableByMotif .= "<th><strong>AbsStart</strong></th>\n";
$resultTableByMotif .= "<th><strong>AbsStop</strong></th>\n";
$resultTableByMotif .= "<th><strong>Sequence</strong></th>\n";
$resultTableByMotif .= "<th><strong>Score</strong></th>\n";
$resultTableByMotif .= "</tr>\n";
#undef $geneName; undef $geneID; undef $tss; undef $chr; undef $strand; undef $longest; undef $species; undef $genomeVersion;
$gray = 1;
}
elsif ($line =~ m/^(.*)\s+(\d+)\s+-\s+(\d+)\s+([\+-])\s+(\w+)\s+([-+]?[0-9]*\.?[0-9]+)/) {
my $motif = $1;
my $start = $2;
my $stop = $3;
#my $strand = $4; # have this from the ID line
my $absStart;
my $absStop;
my $sequence = $5;
my $score = $6;
#cleanup for Jaspar ###############
$motif =~ s/\s+$//g;
$motif =~ s/^\s+//g;
#####################################
#print "Motif=\.$motif\.<br>\n";
if ($strand eq "+") {
$start -= $beforeTss;
$stop -= $beforeTss;
$absStart = $tss + $start;
$absStop = $tss + $stop;
}
elsif ($strand eq "-") {
$start -= $beforeTss;
$stop -= $beforeTss;
$absStart = $tss - $start;
$absStop = $tss - $stop;
}
my $colorCode;
if ($gray) {
$colorCode = "bgcolor=\"#c0c0c0\"";
$gray = 0;
}
else {
$gray++;
}
#add motif/gene information to the $resultTableByMotif to be displayed later ########################################
$resultTableByMotif .= "<tr>\n";
$resultTableByMotif .= "<td>$motif</pre></td>\n";
$resultTableByMotif .= "<td><pre>" . commify($start) . "</pre></td>\n";
$resultTableByMotif .= "<td><pre>" . commify($stop) . "</pre></td>\n";
#$resultTableByMotif .= "<td><b>$strand</b></td>\n";
$resultTableByMotif .= "<td><pre>" . commify($absStart) . "</pre></td>\n";
$resultTableByMotif .= "<td><pre>" . commify($absStop) . "</pre></td>\n";
$resultTableByMotif .= "<td><pre>$sequence</pre></td>\n";
$resultTableByMotif .= "<td><pre>$score</pre></td>\n";
$resultTableByMotif .= "</tr>\n";
#######################################################################################################################
#add information to the grandSummaryArray for motif/gene statistics #####
#########################################################################
#$fullMotifGeneSummary[$s][0] = $geneName; #geneName
#$fullMotifGeneSummary[$s][1] = $motif; #motif
#$fullMotifGeneSummary[$s][2] = $score; #motif score
$s++;
$totalMotifCount{$motif}++; #all occurances for each motif
$totalMotifScore{$motif} += $score; #all occurances for each motif
$totaGeneCount{$geneName} = 1; #not sure what to do with this yet
my $motifAndGene = $motif . "\@" . $geneName;
$geneMotifHash{$motifAndGene}++; # gene@motif combo - used later to count numbef of genes that have each motif
#########################################################################
}
else {
#tmp
print "ELSE LINE:$line<br>\n" if $testing;
}
#print "$line\n";
}
close TMPCLOVERRESULTFILE;
$resultTableByMotif .= "</table>";
#print "</pre>\n";
#print "$resultTableByMotif"; moved to the bottom of the sub here
### Analyze all matches and calculate statistics for the summary-by-motif table, then print the table #############################
###################################################################################################################################
###################################################################################################################################
$result .= "<h1 >Summary by motif</h1>\n";
# count number of genes in which each motif is found (convoluted)
foreach my $key (keys %geneMotifHash) {
my $tmpMotif = $key;
$tmpMotif =~ s/\@.*//;
$genesWithMotif{$tmpMotif}++;
}
#assemble 2D array with the summary per motif
#$motifSummaryArray[$m][0] == motif
#$motifSummaryArray[$m][1] == TF
#$motifSummaryArray[$m][2] == Occurences (total)
#$motifSummaryArray[$m][3] == genes (x of y)
#$motifSummaryArray[$m][4] == percent
#$motifSummaryArray[$m][5] == Average motif score
#$motifSummaryArray[$m][6] == Motifs per gene 1
#$motifSummaryArray[$m][7] == Motifs per gene 2
#$motifSummaryArray[$m][8] == Motifs per gene 3
#$motifSummaryArray[$m][9] == Motifs per gene 4
#$motifSummaryArray[$m][10] == Motifs per gene 5+
my @motifSummaryArray;
my $m =0;
my $totalNumberOfMotifs = keys (%totalMotifCount); #total number of unique motifs we found
foreach my $motifName (keys %totalMotifCount) {
#Prepare motif name with no number for matching with the motif info from the .dat file (for TRANSFAC only, it will not affect Jaspar)
my $motifNameOnly = $motifName;
my $motifNumber;
my $motifToPrint;
# if we are using TRANSFAC
$motifNameOnly =~ s/\s+(.*)//;
$motifToPrint = "$motifNameOnly $1";
#print "motifName = \.$motifName\. motifNameOnly = \.$motifNameOnly\.<br>\n";
if($motifPubmedLink{$motifNameOnly}) {
$motifSummaryArray[$m][0] = "<a href=\"$motifPubmedLink{$motifNameOnly}\" target=\"new\">$motifToPrint</a>";
}
else {
$motifSummaryArray[$m][0] = "$motifToPrint"; #motif
}
$motifSummaryArray[$m][1] = $motifTF{$motifNameOnly}; #occurences
$motifSummaryArray[$m][2] = $totalMotifCount{$motifName}; #occurences
$motifSummaryArray[$m][3] = "$genesWithMotif{$motifName} of $totalSequencesUsed"; #genes
$motifSummaryArray[$m][4] = sprintf("%.0f", ($genesWithMotif{$motifName} / $totalSequencesUsed) * 100); #genes percent
$motifSummaryArray[$m][5] = sprintf("%.1f", $totalMotifScore{$motifName} / $totalMotifCount{$motifName}); #genes percent
$motifSummaryArray[$m][11] = $motifNameOnly; #For Full info display below
# finally, fill out the motifs-per-gene columns
# geneMotifHash contains "motif@gene" as a key. First remove gene for matching
foreach my $motifGene (keys %geneMotifHash) {
my $tmpMotifOnly = $motifGene;
$tmpMotifOnly =~ s/\@.*//;
#print "tmpMotifOnly=$tmpMotifOnly and motifName=$motifName<br>\n";
if ($tmpMotifOnly eq $motifName) {
#print "SAME!<br>\n";
$motifSummaryArray[$m][6]++ if($geneMotifHash{$motifGene} == 1); #Motifs per gene 1
$motifSummaryArray[$m][7]++ if($geneMotifHash{$motifGene} == 2); #Motifs per gene 2
$motifSummaryArray[$m][8]++ if($geneMotifHash{$motifGene} == 3); #Motifs per gene 3
$motifSummaryArray[$m][9]++ if($geneMotifHash{$motifGene} == 4); #Motifs per gene 4
$motifSummaryArray[$m][10]++ if($geneMotifHash{$motifGene} > 4); #Motifs per gene 5+
}
#$genesWithMotif{$tmpMotif}++;
}
$m++
}
$result .= <<HTML;
<table class="resultT" title="summaryByMotif">
<tr>
<td class="resultT" colspan="7">
<h3>Total of $totalNumberOfMotifs unique motifs found<br>Promoter regions of $totalSequencesUsed genes searched</h3>
</td>
<td class="resultT" colspan="5">
<h3>Motifs per gene</h3>
</td>
</tr>
<tr>
<th class="tableT" >
Full
</th>
<th class="tableT" >
Motif
</th>
<th class="tableT" >
TF
</th>
<th class="tableT" >
Occurrences
</th>
<th class="tableT" >
Genes
</th>
<th class="tableT" >
%
</th>
<th class="tableT" >
AvScore
</th>
<th class="tableT" >
1
</th>
<th class="tableT" >
2
</th>
<th class="tableT" >
3
</th>
<th class="tableT" >
4
</th>
<th class="tableT" >
5+
</th>
</tr>
HTML
#sort array here as needed
@motifSummaryArray = sort { $b->[2] <=> $a->[2]} @motifSummaryArray;
$m = 0;
while ($motifSummaryArray[$m][0]) {
$result .= "<tr class\"dd\">\n";
$result .= "<td class=\"resultT\" align=\"center\"><input type=\"button\" onclick=\"alert(\'$fullMotifInfo{$motifSummaryArray[$m][11]}\')\" value=\"Info\" /></td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][0] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][1] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][2] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][3] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][4] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][5] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][6] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][7] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][8] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][9] </td>\n";
$result .= "<td class=\"resultT\">$motifSummaryArray[$m][10] </td>\n";
$result .= "</tr>\n";
$m++;
}
$result .= "</table>";
###################################################################################################################################
###################################################################################################################################
#now print the table with motif/gene details
$result .= "<p> </p>";
$result .= "<h1>Detailed results for each gene</h1>\n";
$result .= "$resultTableByMotif";
#tmptesttmptesttmptest###############
#####################################
#print "<pre>";
#foreach my $key (keys %totalMotifCount) {
# print "Motif $key has total of $totalMotifCount{$key} occurences in $genesWithMotif{$key} (out of $totalSequencesUsed genes) \n";
#}
#foreach my $key (keys %geneMotifHash) {
# print "Motif-Gene $key \n";
#}
#print "<pre>";
#####################################
#####################################
# wrap up the file
#$result .=" </body>\n";
#$result .="</html>\n";
# open the saarchForm.html file
open (RESULTTEMPLATEFILE, "$resultTemplateFile")
or die "Error opening $resultTemplateFile";
while(<RESULTTEMPLATEFILE>) {
my $tmpLine = $_;
$tmpLine =~ s/$1//g if m/(\r\n?|\n\r?)/; #instead of chomp
if($tmpLine eq "<!-- CONTENT HERE -->") {
print "$result\n";
}
else {
print "$tmpLine\n";
}
}
close RESULTTEMPLATEFILE
or die "Error closing $resultTemplateFile";
}
### END parseCloverResults() ##################################################
sub connectDB($) {
# We can now connect to the DB #################################################
my $dbName = shift; #store db name passed to the function into a variable dbName
#print "Connecting to the database<br>\n" if $testing;
$dbh = DBI->connect ("dbi:mysql:$dbName", 'peter', 'peter')
or printError("Error in connecting $DBI::err ($DBI::errstr)", 1, 1);
#print "Successfully connected<br>\n" if $testing;
################################################################################
}
sub disconnectDB() {
# All done so close the DB connection ##########################################
#print "Disconnecting<br>\n" if $testing;
$dbh->disconnect ()
or printError("In disconnecting $DBI::err ($DBI::errstr),", 1, 1);
#print "Successfully disconnected<br>\n" if $testing;
################################################################################
}
### #comify() ##################################################################
### takes a number, returns same number as comma separated for thousands #######
################################################################################
sub commify($) {
my $text = reverse $_[0];
$text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
return scalar reverse $text;
} # commify
### END comify()################################################################