Difference between revisions of "Talk:Perl"
From Christoph's Personal Wiki
Line 1: | Line 1: | ||
+ | ==Perl and bioinformatics== | ||
+ | *Complement every base in the file foo.fsa: | ||
+ | perl -pe 'tr/ATCG/TAGC/' foo.fsa | ||
+ | *Add first and fourth columns: | ||
+ | perl -ane 'print $F[0] + $F[3], "\n"' datafile | ||
+ | *Only print lines 15 - 17: | ||
+ | perl -ne 'print if 15 .. 17' *.gff | ||
+ | *Count lines: | ||
+ | perl -ne '$counter++;END{print "$counter lines";}' datafile | ||
+ | *Split a multi-sequence FASTA file into individual files: | ||
+ | perl -ne 'BEGIN{$/=">";} if(/^\s*(\S+)/){open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">",$_}' | ||
+ | |||
+ | ==Transpose matrix== | ||
+ | ''Note: The following could be used to transpose a CSV file.'' | ||
+ | <pre> | ||
+ | #!/bin/sh | ||
+ | # Example CSV file | ||
+ | # FROM: | ||
+ | # a1,a2,a3 | ||
+ | # b1,b2,b3 | ||
+ | # TO: | ||
+ | # a1,b1 | ||
+ | # a2,b2 | ||
+ | # a3,b3 | ||
+ | |||
+ | perl -e '$unequal=0;$_=<>;s/\r?\n//;@out_rows=split(/\,/, $_);\ | ||
+ | $num_out_rows=$#out_rows+1;while(<>){s/\r?\n//;@F=split(/\,/,$_);\ | ||
+ | foreach $i (0 .. $#F){$out_rows[$i] .="\,$F[$i]";}\ | ||
+ | if($num_out_rows !=$#F+1){$unequal=1;}} END\ | ||
+ | {foreach $row (@out_rows){print"$row\n"} \ | ||
+ | warn "\nWARNING! Rows in input had different numbers of columns\n" \ | ||
+ | if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1 | ||
+ | </pre> | ||
+ | |||
+ | ==Misc== | ||
+ | *Change columns in each line of files: | ||
+ | perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab | ||
+ | |||
+ | *Change tab-separated data to use a different delimiter: | ||
+ | perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv | ||
+ | |||
+ | *Remove spaces in a line: | ||
+ | perl -e 'while(<>){s/ //g;print $_;}' foo | ||
+ | |||
+ | *Change all characters to upper case: | ||
+ | perl -e 'while(<>){print uc($_);}' foo | ||
+ | |||
+ | *Change a FASTA file into tabular format: | ||
+ | perl -e '$count=0; $len=0; while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";}else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab | ||
+ | |||
+ | *Change a tabular file into FASTA format: | ||
+ | perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]";if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s);$s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa | ||
+ | |||
+ | *Change from one biological format to another: | ||
+ | perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0;for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat);$out=Bio::SeqIO->newFh(-format =>$outformat);while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa | ||
+ | |||
+ | ==Arrays as Queues== | ||
+ | <pre> | ||
+ | use strict; | ||
+ | my @queue; | ||
+ | unshift(@queue,"Customer 1"); # @queue is now ("Customer 1") | ||
+ | unshift(@queue,"Customer 2"); # @queue is now ("Customer 2" "Customer 1") | ||
+ | unshift(@queue,"Customer 3"); | ||
+ | # @queue is now ("Customer 3" "Customer 2" "Customer 1") | ||
+ | my $item=pop(@queue); # @queue is now ("Customer 3" "Customer 2") | ||
+ | print "Servicing $item\n"; # prints: Servicing Customer 1\n | ||
+ | $item=pop(@queue); # @queue is now ("Customer 3") | ||
+ | print "Servicing $item\n"; # prints: Servicing Customer 2\n | ||
+ | </pre> | ||
+ | <pre> | ||
+ | use strict; | ||
+ | my @notAqueue; | ||
+ | unshift(@notAqueue,"Customer 0","Customer 1"); | ||
+ | # @queue is now ("Customer 0","Customer 1") | ||
+ | unshift(@notAqueue,"Customer 2"); | ||
+ | # @queue is now ("Customer 2","Customer 0","Customer 1") | ||
+ | </pre> | ||
+ | |||
==Perl and [[MySQL]]== | ==Perl and [[MySQL]]== | ||
This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the [[:Category:Linux Command Line Tools|CLI]]. | This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the [[:Category:Linux Command Line Tools|CLI]]. | ||
Line 5: | Line 83: | ||
<pre> | <pre> | ||
− | mysql -B -e "update cmp_genomics.features set note = '' where user = USER() and note not like 'tcs%' or note is null" | + | mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'tcs%' or note is null" |
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
foreach type (ecf s54 s70) | foreach type (ecf s54 s70) | ||
Line 19: | Line 97: | ||
</pre> | </pre> | ||
<pre> | <pre> | ||
− | mysql -B -e "update cmp_genomics.features set note='' where user = USER() and note not like 'sigma%' or note is null" | + | mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'sigma%' or note is null" |
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK) | foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK) | ||
Line 33: | Line 111: | ||
</pre> | </pre> | ||
<pre> | <pre> | ||
− | rm -f source/all.16s.fsa | + | rm -f source/all.16s.fsa # in case any exist (from previous runs) |
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
− | mysql -B -N -D cmp_genomics -e " | + | mysql -B -N -D cmp_genomics -e "SELECT SUBSTR(seq,start,stop-start+1) AS sequence,\ |
− | + | REPLACE(substring_index(genomes.organism,' ',4),' ','_') AS name,dir from genomes,features \ | |
− | genomes.accession = features.accession | + | WHERE genomes.accession=features.accession AND genomes.user=features.user AND \ |
− | features.user = user() | + | features.user=user() AND featuretype='rRNA' AND features.accession='$accession'" | \ |
perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\ | perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\ | ||
print ">$F[1]\n"; \ | print ">$F[1]\n"; \ | ||
Line 65: | Line 143: | ||
foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \ | perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \ | ||
− | {print " | + | {print "INSERT INTO localization (user,protid,type) \ |
− | + | SELECT USER() AS user, ft.id AS protid, \"signalp\" AS type \ | |
− | + | FROM features AS ft \ | |
− | + | WHERE ft.user=USER() AND \ | |
− | ft.accession=\"'$accession'\" | + | ft.accession=\"'$accession'\" AND \ |
− | ft.start=$1 and ft.stop=$2 | + | ft.start=$1 and ft.stop=$2 AND \ |
ft.dir=\"$3\";\n";}' $accession.signalp \ | ft.dir=\"$3\";\n";}' $accession.signalp \ | ||
|mysql -D cmp_genomics | |mysql -D cmp_genomics | ||
Line 82: | Line 160: | ||
R | R | ||
− | postscript ('final.ps') | + | postscript('final.ps') |
source("/path/to/heatmap.R") | source("/path/to/heatmap.R") | ||
data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t")) | data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t")) | ||
− | pal<-maPalette(low=rgb(10,10,10,maxColorValue=10),high=rgb(0,0,10,maxColorValue=10),mid=rgb(5,5,10,maxColorValue=10),, k=1024) | + | pal<-maPalette(low=rgb(10,10,10,maxColorValue=10), |
+ | high=rgb(0,0,10,maxColorValue=10), | ||
+ | mid=rgb(5,5,10,maxColorValue=10),, k=1024) | ||
par(oma=c(0,0,0,0.5)) | par(oma=c(0,0,0,0.5)) | ||
data<-scale(data) | data<-scale(data) | ||
Line 91: | Line 171: | ||
dev.off() | dev.off() | ||
quit(save="no") | quit(save="no") | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
</pre> | </pre> | ||
==External links== | ==External links== | ||
*[http://sysbio.harvard.edu/csb/resources/computational/scriptome/UNIX/Tools/Change.html] | *[http://sysbio.harvard.edu/csb/resources/computational/scriptome/UNIX/Tools/Change.html] |
Revision as of 08:40, 8 September 2007
Contents
Perl and bioinformatics
- Complement every base in the file foo.fsa:
perl -pe 'tr/ATCG/TAGC/' foo.fsa
- Add first and fourth columns:
perl -ane 'print $F[0] + $F[3], "\n"' datafile
- Only print lines 15 - 17:
perl -ne 'print if 15 .. 17' *.gff
- Count lines:
perl -ne '$counter++;END{print "$counter lines";}' datafile
- Split a multi-sequence FASTA file into individual files:
perl -ne 'BEGIN{$/=">";} if(/^\s*(\S+)/){open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">",$_}'
Transpose matrix
Note: The following could be used to transpose a CSV file.
#!/bin/sh # Example CSV file # FROM: # a1,a2,a3 # b1,b2,b3 # TO: # a1,b1 # a2,b2 # a3,b3 perl -e '$unequal=0;$_=<>;s/\r?\n//;@out_rows=split(/\,/, $_);\ $num_out_rows=$#out_rows+1;while(<>){s/\r?\n//;@F=split(/\,/,$_);\ foreach $i (0 .. $#F){$out_rows[$i] .="\,$F[$i]";}\ if($num_out_rows !=$#F+1){$unequal=1;}} END\ {foreach $row (@out_rows){print"$row\n"} \ warn "\nWARNING! Rows in input had different numbers of columns\n" \ if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1
Misc
- Change columns in each line of files:
perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab
- Change tab-separated data to use a different delimiter:
perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv
- Remove spaces in a line:
perl -e 'while(<>){s/ //g;print $_;}' foo
- Change all characters to upper case:
perl -e 'while(<>){print uc($_);}' foo
- Change a FASTA file into tabular format:
perl -e '$count=0; $len=0; while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";}else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab
- Change a tabular file into FASTA format:
perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]";if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s);$s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa
- Change from one biological format to another:
perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0;for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat);$out=Bio::SeqIO->newFh(-format =>$outformat);while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa
Arrays as Queues
use strict; my @queue; unshift(@queue,"Customer 1"); # @queue is now ("Customer 1") unshift(@queue,"Customer 2"); # @queue is now ("Customer 2" "Customer 1") unshift(@queue,"Customer 3"); # @queue is now ("Customer 3" "Customer 2" "Customer 1") my $item=pop(@queue); # @queue is now ("Customer 3" "Customer 2") print "Servicing $item\n"; # prints: Servicing Customer 1\n $item=pop(@queue); # @queue is now ("Customer 3") print "Servicing $item\n"; # prints: Servicing Customer 2\n
use strict; my @notAqueue; unshift(@notAqueue,"Customer 0","Customer 1"); # @queue is now ("Customer 0","Customer 1") unshift(@notAqueue,"Customer 2"); # @queue is now ("Customer 2","Customer 0","Customer 1")
Perl and MySQL
This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the CLI.
Note: These examples were taken from my course in Comparative Microbial Genomics at CBS (in Denmark).
mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'tcs%' or note is null" foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) foreach type (ecf s54 s70) cat source/$accession.proteins.$type.sigmas.hmmsearch | \ perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\ next unless $score>0;\ print "update cmp_genomics.features set note=\"Sigma Factor '$type'\" \ where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\ |mysql end end
mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'sigma%' or note is null" foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK) cat source/$accession.$type.TCS.hmmsearch | \ perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\ next unless $score>0; \ print "update cmp_genomics.features set note=\"TCS '$type'\" \ where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\ |mysql -B end end
rm -f source/all.16s.fsa # in case any exist (from previous runs) foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) mysql -B -N -D cmp_genomics -e "SELECT SUBSTR(seq,start,stop-start+1) AS sequence,\ REPLACE(substring_index(genomes.organism,' ',4),' ','_') AS name,dir from genomes,features \ WHERE genomes.accession=features.accession AND genomes.user=features.user AND \ features.user=user() AND featuretype='rRNA' AND features.accession='$accession'" | \ perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\ print ">$F[1]\n"; \ for($x=0;$x<length($F[0]);$x=$x+60){print substr($F[0],$x,60),"\n";}last;' \ >> source/all.16s.fsa end less source/all.16s.fsa
cat CP000034.codon-usage.aa.list \ | perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \ > aa.dat mysql -B -N -e 'select accession,organism from cmp_genomics.genomes where user=USER()' \ > organism.names foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) cat $accession.codon-usage.aa.list \ |perl -ane 'BEGIN{foreach(`cat organism.names`) \ {chomp;@A=split(/\t+/,$_);$NAMES{$A[0]}=$A[1];}} \ next unless $F[1]=~/^[A-Z]+/;$FREQ{$F[1]}=$F[3];push @AA,$F[1]; \ END{print "\n";print "$NAMES{'$accession'}";foreach $aa(@AA){print "\t",$FREQ{$aa};}}' \ >> aa.dat end
mysql -e "delete from localization where user=USER() and type='signalp'" cmp_genomics foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \ {print "INSERT INTO localization (user,protid,type) \ SELECT USER() AS user, ft.id AS protid, \"signalp\" AS type \ FROM features AS ft \ WHERE ft.user=USER() AND \ ft.accession=\"'$accession'\" AND \ ft.start=$1 and ft.stop=$2 AND \ ft.dir=\"$3\";\n";}' $accession.signalp \ |mysql -D cmp_genomics end
perl -e 'foreach $i(1..11){open IN,"col$i";while(<IN>){chomp;($a,$c)=split(/\t+/,$_);$DATA{$a}{$i}=$c;}close IN;} \ END{foreach $a(reverse sort keys %DATA){foreach $i(1..11){$DATA{$a}{$i}=0 if $DATA{$a}{$i} eq "" and $i>1; \ print $DATA{$a}{$i};print "\t" if $i<11;}print "\n";}}' \ > final.dat R postscript('final.ps') source("/path/to/heatmap.R") data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t")) pal<-maPalette(low=rgb(10,10,10,maxColorValue=10), high=rgb(0,0,10,maxColorValue=10), mid=rgb(5,5,10,maxColorValue=10),, k=1024) par(oma=c(0,0,0,0.5)) data<-scale(data) new_heatmap(data,cexCol=1,cexRow=1,colors=pal) dev.off() quit(save="no")