Difference between revisions of "Talk:Perl"

From Christoph's Personal Wiki
Jump to: navigation, search
Line 10: Line 10:
 
     cat source/$accession.proteins.$type.sigmas.hmmsearch | \
 
     cat source/$accession.proteins.$type.sigmas.hmmsearch | \
 
     perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\
 
     perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\
              my ($start,$stop,$dir,$score,$evalue) = ($1,$2,$3,$4,$5);\
+
        my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\
              next unless $score > 0;\
+
        next unless $score>0;\
              print "update cmp_genomics.features set note = \"Sigma Factor '$type'\" \
+
        print "update cmp_genomics.features set note=\"Sigma Factor '$type'\" \
                      where start=$start and stop = $stop and user = user() and accession = \"'$accession'\";\n";'\
+
          where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\
     | mysql
+
     |mysql
 
   end
 
   end
 
end
 
end
 
</pre>
 
</pre>
 
<pre>
 
<pre>
mysql -B -e "update cmp_genomics.features set note = '' where user = USER() and note not like 'sigma%' or note is null"
+
mysql -B -e "update cmp_genomics.features set note='' where user = USER() and note not like 'sigma%' or note is null"
 
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
 
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
 
   foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK)
 
   foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK)
 
     cat source/$accession.$type.TCS.hmmsearch | \
 
     cat source/$accession.$type.TCS.hmmsearch | \
 
     perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\
 
     perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\
              my ($start,$stop,$dir,$score,$evalue) = ($1,$2,$3,$4,$5);\
+
        my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\
              next unless $score > 0; \
+
        next unless $score>0; \
              print "update cmp_genomics.features set note = \"TCS '$type'\" \
+
        print "update cmp_genomics.features set note=\"TCS '$type'\" \
                      where start=$start and stop = $stop and user = user() and accession = \"'$accession'\";\n";'\
+
          where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\
     | mysql -B
+
     |mysql -B
 
   end
 
   end
 
end
 
end
Line 39: Line 39:
 
     genomes.accession = features.accession and genomes.user = features.user and \
 
     genomes.accession = features.accession and genomes.user = features.user and \
 
     features.user = user() and featuretype='rRNA' and features.accession = '$accession'" | \
 
     features.user = user() and featuretype='rRNA' and features.accession = '$accession'" | \
   perl -ane '$F[0] = reverse ( $F[0] ) if $F[2] eq "-";$F[0] =~ tr/ATGC/TACG/ if $F[2] eq "-";\
+
   perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\
              print ">$F[1]\n"; \
+
    print ">$F[1]\n"; \
              for ( $x = 0 ; $x < length ( $F[0] ) ; $x = $x+60) { print substr ( $F[0] , $x , 60),"\n"; } last;' \
+
    for($x=0;$x<length($F[0]);$x=$x+60){print substr($F[0],$x,60),"\n";}last;' \
 
   >> source/all.16s.fsa
 
   >> source/all.16s.fsa
 
end
 
end
Line 47: Line 47:
 
</pre>
 
</pre>
 
<pre>
 
<pre>
cat ~/Ex3/source/CP000034.codon-usage.aa.list \
+
cat CP000034.codon-usage.aa.list \
 
| perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \
 
| perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \
 
> aa.dat
 
> aa.dat
Line 53: Line 53:
 
> organism.names
 
> organism.names
 
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
 
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
   cat ~/Ex3/source/$accession.codon-usage.aa.list \
+
   cat $accession.codon-usage.aa.list \
   | perl -ane 'BEGIN { foreach (`cat organism.names`) \
+
   |perl -ane 'BEGIN{foreach(`cat organism.names`) \
                      {chomp; @A = split (/\t+/,$_);$NAMES{$A[0]} = $A[1];} } \
+
      {chomp;@A=split(/\t+/,$_);$NAMES{$A[0]}=$A[1];}} \
              next unless $F[1] =~ /^[A-Z]+/;$FREQ{$F[1]} = $F[3]; push @AA,$F[1]; \
+
      next unless $F[1]=~/^[A-Z]+/;$FREQ{$F[1]}=$F[3];push @AA,$F[1]; \
              END { print "\n"; print "$NAMES{'$accession'}"; foreach $aa (@AA){print "\t",$FREQ{$aa};}}' \
+
      END{print "\n";print "$NAMES{'$accession'}";foreach $aa(@AA){print "\t",$FREQ{$aa};}}' \
 
   >> aa.dat
 
   >> aa.dat
 
end
 
end
Line 65: Line 65:
 
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
 
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
 
   perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \
 
   perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \
            {print "insert into localization (user, protid, type) \
+
    {print "insert into localization (user,protid,type) \
                          select USER() as user, ft.id as protid, \"signalp\" as type \
+
          select USER() as user, ft.id as protid, \"signalp\" as type \
                                  from features as ft \
+
          from features as ft \
                                  where ft.user=USER() and \
+
          where ft.user=USER() and \
                                        ft.accession = \"'$accession'\" and \
+
          ft.accession=\"'$accession'\" and \
                                        ft.start = $1 and ft.stop = $2 and \
+
          ft.start=$1 and ft.stop=$2 and \
                                        ft.dir = \"$3\";\n";}' $accession.signalp \
+
          ft.dir=\"$3\";\n";}' $accession.signalp \
   | mysql -D cmp_genomics
+
   |mysql -D cmp_genomics
 
end
 
end
 
</pre>
 
</pre>
 
<pre>
 
<pre>
perl -e 'foreach $i (1..11) { open IN , "col$i"; while (<IN>) { chomp;($a , $c) = split (/\t+/,$_); $DATA{$a}{$i} = $c;} close IN;} \
+
perl -e 'foreach $i(1..11){open IN,"col$i";while(<IN>){chomp;($a,$c)=split(/\t+/,$_);$DATA{$a}{$i}=$c;}close IN;} \
        END {foreach $a ( reverse sort keys %DATA) {foreach $i (1..11) {$DATA{$a}{$i} = 0 if $DATA{$a}{$i} eq "" and $i > 1; \
+
    END{foreach $a(reverse sort keys %DATA){foreach $i(1..11){$DATA{$a}{$i}=0 if $DATA{$a}{$i} eq "" and $i>1; \
        print $DATA{$a}{$i}; print "\t" if $i < 11;}print "\n";}}' \
+
    print $DATA{$a}{$i};print "\t" if $i<11;}print "\n";}}' \
 
> final.dat
 
> final.dat
  
 
R
 
R
 
postscript ('final.ps')
 
postscript ('final.ps')
source ( "/home/people/kiil/Exercises/files/heatmap.R" )
+
source("/path/to/heatmap.R")
data <- as.matrix ( read.table ( 'final.dat' , row.names=1, header= TRUE,sep = "\t" ) )
+
data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t"))
pal <- maPalette(low = rgb(10,10,10,maxColorValue = 10), high = rgb(0,0,10,maxColorValue = 10), mid= rgb(5,5,10,maxColorValue = 10),, k = 1024)
+
pal<-maPalette(low=rgb(10,10,10,maxColorValue=10),high=rgb(0,0,10,maxColorValue=10),mid=rgb(5,5,10,maxColorValue=10),, k=1024)
par(oma=c(0, 0, 0, 0.5))
+
par(oma=c(0,0,0,0.5))
data <- scale(data)
+
data<-scale(data)
new_heatmap ( data , cexCol=1,cexRow=1,colors = pal)
+
new_heatmap(data,cexCol=1,cexRow=1,colors=pal)
 
dev.off()
 
dev.off()
 
quit(save="no")
 
quit(save="no")
Line 101: Line 101:
 
  perl -ne 'print if 15 .. 17' *.gff
 
  perl -ne 'print if 15 .. 17' *.gff
 
*Count lines:
 
*Count lines:
  perl -ne '$counter++; END { print "$counter lines"; }' datafile
+
  perl -ne '$counter++;END{print "$counter lines";}' datafile
 
*Split a multi-sequence FASTA file into individual files:
 
*Split a multi-sequence FASTA file into individual files:
  perl -ne 'BEGIN{ $/=">"; } if(/^\s*(\S+)/){ open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">", $_ }'
+
  perl -ne 'BEGIN{$/=">";} if(/^\s*(\S+)/){open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">",$_}'
  
 
==Transpose matrix==
 
==Transpose matrix==
Line 153: Line 153:
 
use strict;
 
use strict;
 
my @queue;
 
my @queue;
unshift (@queue, "Customer 1"); # @queue is now ("Customer 1")
+
unshift(@queue,"Customer 1"); # @queue is now ("Customer 1")
unshift (@queue, "Customer 2"); # @queue is now ("Customer 2" "Customer 1")
+
unshift(@queue,"Customer 2"); # @queue is now ("Customer 2" "Customer 1")
unshift (@queue, "Customer 3");
+
unshift(@queue,"Customer 3");
 
           # @queue is now ("Customer 3" "Customer 2" "Customer 1")
 
           # @queue is now ("Customer 3" "Customer 2" "Customer 1")
my $item = pop(@queue);        # @queue is now ("Customer 3" "Customer 2")
+
my $item=pop(@queue);        # @queue is now ("Customer 3" "Customer 2")
print "Servicing $item\n";     # prints:  Servicing Customer 1\n
+
print "Servicing $item\n";   # prints:  Servicing Customer 1\n
$item = pop(@queue);            # @queue is now ("Customer 3")
+
$item=pop(@queue);            # @queue is now ("Customer 3")
print "Servicing $item\n";     # prints:  Servicing Customer 2\n
+
print "Servicing $item\n";   # prints:  Servicing Customer 2\n
 
</pre>
 
</pre>
 
<pre>
 
<pre>
 
use strict;
 
use strict;
 
my @notAqueue;
 
my @notAqueue;
unshift(@notAqueue, "Customer 0", "Customer 1");
+
unshift(@notAqueue,"Customer 0","Customer 1");
                                # @queue is now ("Customer 0", "Customer 1")
+
          # @queue is now ("Customer 0","Customer 1")
unshift (@notAqueue, "Customer 2");
+
unshift(@notAqueue,"Customer 2");
           # @queue is now ("Customer 2", "Customer 0", "Customer 1")
+
           # @queue is now ("Customer 2","Customer 0","Customer 1")
 
</pre>
 
</pre>
  
 
==External links==
 
==External links==
 
*[http://sysbio.harvard.edu/csb/resources/computational/scriptome/UNIX/Tools/Change.html]
 
*[http://sysbio.harvard.edu/csb/resources/computational/scriptome/UNIX/Tools/Change.html]

Revision as of 04:58, 1 July 2007

Perl and MySQL

This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the CLI.

Note: These examples were taken from my course in Comparative Microbial Genomics at CBS (in Denmark).

mysql -B -e "update cmp_genomics.features set note = '' where user = USER() and note not like 'tcs%' or note is null"
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
   foreach type (ecf s54 s70)
     cat source/$accession.proteins.$type.sigmas.hmmsearch | \
     perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\
        my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\
        next unless $score>0;\
        print "update cmp_genomics.features set note=\"Sigma Factor '$type'\" \
           where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\
     |mysql
   end
end
mysql -B -e "update cmp_genomics.features set note='' where user = USER() and note not like 'sigma%' or note is null"
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
   foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK)
     cat source/$accession.$type.TCS.hmmsearch | \
     perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\
        my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\
        next unless $score>0; \
        print "update cmp_genomics.features set note=\"TCS '$type'\" \
           where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\
     |mysql -B
   end
end
rm -f source/all.16s.fsa
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
   mysql -B -N -D cmp_genomics -e "select substr(seq,start,stop-start+1) as sequence,\
     replace(substring_index(genomes.organism,' ',4),' ','_') as name,dir from genomes,features where \
     genomes.accession = features.accession and genomes.user = features.user and \
     features.user = user() and featuretype='rRNA' and features.accession = '$accession'" | \
   perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\
     print ">$F[1]\n"; \
     for($x=0;$x<length($F[0]);$x=$x+60){print substr($F[0],$x,60),"\n";}last;' \
   >> source/all.16s.fsa
end
less source/all.16s.fsa
cat CP000034.codon-usage.aa.list \
| perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \
> aa.dat
mysql -B -N -e 'select accession,organism from cmp_genomics.genomes where user=USER()' \
> organism.names
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
  cat $accession.codon-usage.aa.list \
  |perl -ane 'BEGIN{foreach(`cat organism.names`) \
       {chomp;@A=split(/\t+/,$_);$NAMES{$A[0]}=$A[1];}} \
       next unless $F[1]=~/^[A-Z]+/;$FREQ{$F[1]}=$F[3];push @AA,$F[1]; \
       END{print "\n";print "$NAMES{'$accession'}";foreach $aa(@AA){print "\t",$FREQ{$aa};}}' \
  >> aa.dat
end
mysql -e "delete from localization where user=USER() and type='signalp'" cmp_genomics
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034)
  perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \
     {print "insert into localization (user,protid,type) \
          select USER() as user, ft.id as protid, \"signalp\" as type \
          from features as ft \
          where ft.user=USER() and \
          ft.accession=\"'$accession'\" and \
          ft.start=$1 and ft.stop=$2 and \
          ft.dir=\"$3\";\n";}' $accession.signalp \
  |mysql -D cmp_genomics
end
perl -e 'foreach $i(1..11){open IN,"col$i";while(<IN>){chomp;($a,$c)=split(/\t+/,$_);$DATA{$a}{$i}=$c;}close IN;} \
     END{foreach $a(reverse sort keys %DATA){foreach $i(1..11){$DATA{$a}{$i}=0 if $DATA{$a}{$i} eq "" and $i>1; \
     print $DATA{$a}{$i};print "\t" if $i<11;}print "\n";}}' \
> final.dat

R
postscript ('final.ps')
source("/path/to/heatmap.R")
data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t"))
pal<-maPalette(low=rgb(10,10,10,maxColorValue=10),high=rgb(0,0,10,maxColorValue=10),mid=rgb(5,5,10,maxColorValue=10),, k=1024)
par(oma=c(0,0,0,0.5))
data<-scale(data)
new_heatmap(data,cexCol=1,cexRow=1,colors=pal)
dev.off()
quit(save="no")

Perl and bioinformatics

  • Complement every base in the file foo.fsa:
perl -pe 'tr/ATCG/TAGC/' foo.fsa
  • Add first and fourth columns:
perl -ane 'print $F[0] + $F[3], "\n"' datafile
  • Only print lines 15 - 17:
perl -ne 'print if 15 .. 17' *.gff
  • Count lines:
perl -ne '$counter++;END{print "$counter lines";}' datafile
  • Split a multi-sequence FASTA file into individual files:
perl -ne 'BEGIN{$/=">";} if(/^\s*(\S+)/){open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">",$_}'

Transpose matrix

Note: The following could be used to transpose a CSV file.

#!/bin/sh
# Example CSV file
# FROM:
#  a1,a2,a3
#  b1,b2,b3
# TO:
#  a1,b1
#  a2,b2
#  a3,b3

perl -e '$unequal=0;$_=<>;s/\r?\n//;@out_rows=split(/\,/, $_);\
$num_out_rows=$#out_rows+1;while(<>){s/\r?\n//;@F=split(/\,/,$_);\
foreach $i (0 .. $#F){$out_rows[$i] .="\,$F[$i]";}\
if($num_out_rows !=$#F+1){$unequal=1;}} END\
{foreach $row (@out_rows){print"$row\n"} \
warn "\nWARNING! Rows in input had different numbers of columns\n" \
if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1

Misc

  • Change columns in each line of files:
perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab
  • Change tab-separated data to use a different delimiter:
perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv
  • Remove spaces in a line:
perl -e 'while(<>){s/ //g;print $_;}' foo
  • Change all characters to upper case:
perl -e 'while(<>){print uc($_);}' foo
  • Change a FASTA file into tabular format:
perl -e '$count=0; $len=0; while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";}else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab
  • Change a tabular file into FASTA format:
perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]";if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s);$s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa
  • Change from one biological format to another:
perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0;for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat);$out=Bio::SeqIO->newFh(-format =>$outformat);while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa

Arrays as Queues

use strict;
my @queue;
unshift(@queue,"Customer 1"); # @queue is now ("Customer 1")
unshift(@queue,"Customer 2"); # @queue is now ("Customer 2" "Customer 1")
unshift(@queue,"Customer 3");
          # @queue is now ("Customer 3" "Customer 2" "Customer 1")
my $item=pop(@queue);         # @queue is now ("Customer 3" "Customer 2")
print "Servicing $item\n";    # prints:  Servicing Customer 1\n
$item=pop(@queue);            # @queue is now ("Customer 3")
print "Servicing $item\n";    # prints:  Servicing Customer 2\n
use strict;
my @notAqueue;
unshift(@notAqueue,"Customer 0","Customer 1");
          # @queue is now ("Customer 0","Customer 1")
unshift(@notAqueue,"Customer 2");
          # @queue is now ("Customer 2","Customer 0","Customer 1")

External links