Difference between revisions of "Perl/Scripts"
(7 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
The following is a compilation of '''useful one-line scripts for [[Perl]]'''. The original was compiled by Tom Christiansen, however, I have converted it into the wiki format and have added more examples (always adding more). | The following is a compilation of '''useful one-line scripts for [[Perl]]'''. The original was compiled by Tom Christiansen, however, I have converted it into the wiki format and have added more examples (always adding more). | ||
+ | |||
+ | ==Edit a file from the CLI using [[Perl]]== | ||
+ | |||
+ | perl -p -i.bak -e 's|before|after|' filename # backup of original file 'filename.bak' | ||
+ | |||
+ | perl -p -i -e 's|before|after|ig' filename | ||
+ | |||
+ | ;-p : Execute the command for every line of the file. | ||
+ | ;-i : Invokes in place editing. If a backup extension is provided, a backup of the original file will be made with the extension provided. e.g. -i.bak. | ||
+ | ;-e : Invoke command. | ||
==Perl one-liners== | ==Perl one-liners== | ||
Line 70: | Line 80: | ||
*command-line that prints the last 50 lines (expensively) | *command-line that prints the last 50 lines (expensively) | ||
perl -e '@lines = <>; print @lines[ $#lines .. $#lines-50' f1 f2 f3 ... | perl -e '@lines = <>; print @lines[ $#lines .. $#lines-50' f1 f2 f3 ... | ||
+ | |||
+ | *using the ternary operator: | ||
+ | echo "|E..|"|perl -ne 'printf("%s\n",substr($_,1,1)=~/[A-Z]/?substr($_,1,1):"_");' | ||
+ | # prints "E" if that substr is A-Z | ||
+ | # else prints "_" | ||
+ | |||
+ | ==The tr/// operator== | ||
+ | The "tr///" operator performs a substitution on the individual characters in a string. | ||
+ | |||
+ | *Simple example: | ||
+ | #!/usr/bin/perl -w | ||
+ | use strict; | ||
+ | my $text = 'some cheese'; | ||
+ | $text =~ tr/ce/XY/; | ||
+ | print "$text\n"; | ||
+ | |||
+ | *More examples: | ||
+ | $x =~ tr/a/b/; # replace each "a" with a "b". | ||
+ | $x =~ tr/ /_/; # convert spaces to underlines. | ||
+ | $x =~ tr/aeiou/AEIOU/; # capitalise vowels. | ||
+ | $x =~ tr/79/97/; # exchange "7" and "9". | ||
+ | |||
+ | The only characters that have special meaning to "tr///" are the backslash and the dash. The latter indicates a range of characters: | ||
+ | |||
+ | $x =~ tr/0-9/QERTYUIOPX/; # Digits to letters. | ||
+ | $x =~ tr/A-Z/a-z/; # Convert to lowercase. | ||
+ | |||
+ | Actually, the slash also has a special meaning to "tr///". The slash is called the "delimiter", because it indicates the "limit" on the list of characters to substitute. However, we can use most kinds of punctuation in place of the slash. For example: | ||
+ | |||
+ | $x =~ tr!aeiou!AEIOU!; | ||
+ | $x =~ tr:aeiou:AEIOU:; | ||
+ | |||
+ | Note that we can also use parentheses, but the syntax changes a little because parentheses include the idea of containment: | ||
+ | |||
+ | $x =~ tr(aeiou)(AEIOU); | ||
+ | $x =~ tr<aeiou><AEIOU>; | ||
+ | |||
+ | The semantics (meaning) don't change; only the syntax (way of writing it) changes. But even though the delimiter is arbitrary, we still talk about it as "tr///". | ||
+ | |||
+ | "tr///" returns the number of replacements it made: | ||
+ | |||
+ | my $salary = '$1,000,000.00'; # Dollar sign: use single quote! | ||
+ | my $ego = ($salary =~ tr/0/0/); # Count the zeros in salary. | ||
+ | |||
+ | One more thing: "tr///" has an alias: "y///". This is to please users of [[sed]], which uses the "y///" command do to basically what "tr///" does. In Perl, "tr///" and "y///" do exactly the same thing; use whichever you like. Remember: there is more than one way to do it (TIMTOWTDI). | ||
+ | |||
+ | $text =~ tr/0-9/A-J/; # Convert digits to letters. | ||
+ | $text =~ y/0-9/A-J/; # Does exactly the same thing. | ||
+ | |||
+ | ===Options=== | ||
+ | "tr///" can take the following options: | ||
+ | c # complement (invert) the search-list. | ||
+ | d # delete found but un-replaced characters. | ||
+ | s # squash duplicate replaced characters. | ||
+ | |||
+ | These options are specified after the final delimiter, like this: | ||
+ | |||
+ | $x =~ tr/abc/xyz/s; # Note the "s" at the end. | ||
+ | $x =~ tr(abc)(xyz)s; # Same thing, but with parentheses. | ||
+ | $x =~ tr/abc/xy/ds; # Multiple options. | ||
+ | # In the last case, the "z" is missing. You'll see why shortly. | ||
+ | |||
+ | In the last example, we specified both the "d" and the "s" options. The order of the options is not important (i.e., "sd"="ds"). | ||
+ | |||
+ | *Examples: | ||
+ | #!/usr/bin/perl -w | ||
+ | use strict; | ||
+ | |||
+ | ##### The "s" option ##### | ||
+ | my $text = 'good cheese'; | ||
+ | $text =~ tr/eo/eu/s; | ||
+ | print "$text\n"; | ||
+ | # Output is: gud chese | ||
+ | |||
+ | ##### The "d" option ##### | ||
+ | my $big = 'vowels are useful'; | ||
+ | $big =~ tr/aeiou/AEI/d; | ||
+ | print "$big\n"; | ||
+ | # The first three vowels are made uppercase. | ||
+ | # The other two, which have no replacement | ||
+ | # character, are deleted because of the "d". | ||
+ | |||
+ | ==Average numbers== | ||
+ | A program that reads numbers and outputs the average. Here is one such program: | ||
+ | #!/usr/bin/perl -w | ||
+ | use strict; | ||
+ | my $line; | ||
+ | my $sum=0; | ||
+ | my $n=0; | ||
+ | while(defined($line=<STDIN>)){ | ||
+ | $sum+=$line; | ||
+ | $n++; | ||
+ | } | ||
+ | my $average=$sum/$n; | ||
+ | print "The average is $sum/$n = $average\n"; | ||
+ | |||
+ | ==Misc== | ||
+ | *Change columns in each line of files: | ||
+ | perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab | ||
+ | |||
+ | *Change tab-separated data to use a different delimiter: | ||
+ | perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv | ||
+ | |||
+ | *Remove spaces in a line: | ||
+ | perl -e 'while(<>){s/ //g;print $_;}' foo | ||
+ | |||
+ | *Change all characters to upper case: | ||
+ | perl -e 'while(<>){print uc($_);}' foo | ||
+ | |||
+ | *Change a FASTA file into tabular format: | ||
+ | perl -e '$count=0;$len=0;while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){ \ | ||
+ | if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";} \ | ||
+ | else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab | ||
+ | |||
+ | *Change a tabular file into FASTA format: | ||
+ | perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]"; \ | ||
+ | if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s); \ | ||
+ | $s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa | ||
+ | |||
+ | *Change from one biological format to another: | ||
+ | perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0; \ | ||
+ | for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat); \ | ||
+ | $out=Bio::SeqIO->newFh(-format =>$outformat); \ | ||
+ | while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa | ||
+ | |||
+ | ==Arrays as Queues== | ||
+ | <pre> | ||
+ | use strict; | ||
+ | my @queue; | ||
+ | unshift(@queue,"Customer 1"); # @queue is now ("Customer 1") | ||
+ | unshift(@queue,"Customer 2"); # @queue is now ("Customer 2" "Customer 1") | ||
+ | unshift(@queue,"Customer 3"); | ||
+ | # @queue is now ("Customer 3" "Customer 2" "Customer 1") | ||
+ | my $item=pop(@queue); # @queue is now ("Customer 3" "Customer 2") | ||
+ | print "Servicing $item\n"; # prints: Servicing Customer 1\n | ||
+ | $item=pop(@queue); # @queue is now ("Customer 3") | ||
+ | print "Servicing $item\n"; # prints: Servicing Customer 2\n | ||
+ | </pre> | ||
+ | <pre> | ||
+ | use strict; | ||
+ | my @notAqueue; | ||
+ | unshift(@notAqueue,"Customer 0","Customer 1"); | ||
+ | # @queue is now ("Customer 0","Customer 1") | ||
+ | unshift(@notAqueue,"Customer 2"); | ||
+ | # @queue is now ("Customer 2","Customer 0","Customer 1") | ||
+ | </pre> | ||
+ | |||
+ | ==Transpose matrix== | ||
+ | ''Note: The following could be used to transpose a CSV file.'' | ||
+ | <pre> | ||
+ | #!/bin/sh | ||
+ | # Example CSV file | ||
+ | # FROM: | ||
+ | # a1,a2,a3 | ||
+ | # b1,b2,b3 | ||
+ | # TO: | ||
+ | # a1,b1 | ||
+ | # a2,b2 | ||
+ | # a3,b3 | ||
+ | |||
+ | perl -e '$unequal=0;$_=<>;s/\r?\n//;@out_rows=split(/\,/, $_);\ | ||
+ | $num_out_rows=$#out_rows+1;while(<>){s/\r?\n//;@F=split(/\,/,$_);\ | ||
+ | foreach $i (0 .. $#F){$out_rows[$i] .="\,$F[$i]";}\ | ||
+ | if($num_out_rows !=$#F+1){$unequal=1;}} END\ | ||
+ | {foreach $row (@out_rows){print"$row\n"} \ | ||
+ | warn "\nWARNING! Rows in input had different numbers of columns\n" \ | ||
+ | if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1 | ||
+ | </pre> | ||
+ | |||
+ | ==Perl and bioinformatics== | ||
+ | *Complement every base in the file foo.fsa: | ||
+ | perl -pe 'tr/ATCG/TAGC/' foo.fsa | ||
+ | *Add first and fourth columns: | ||
+ | perl -ane 'print $F[0] + $F[3], "\n"' datafile | ||
+ | *Only print lines 15 - 17: | ||
+ | perl -ne 'print if 15 .. 17' *.gff | ||
+ | *Count lines: | ||
+ | perl -ne '$counter++;END{print "$counter lines";}' datafile | ||
+ | *Split a multi-sequence FASTA file into individual files: | ||
+ | perl -ne 'BEGIN{$/=">";} if(/^\s*(\S+)/){open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">",$_}' | ||
+ | |||
+ | ==Perl and [[MySQL]]== | ||
+ | This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the [[:Category:Linux Command Line Tools|CLI]]. | ||
+ | |||
+ | ''Note: These examples were taken from my course in Comparative Microbial Genomics at CBS (in Denmark).'' | ||
+ | |||
+ | <pre> | ||
+ | mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'tcs%' or note is null" | ||
+ | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
+ | foreach type (ecf s54 s70) | ||
+ | cat source/$accession.proteins.$type.sigmas.hmmsearch | \ | ||
+ | perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ | ||
+ | my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\ | ||
+ | next unless $score>0;\ | ||
+ | print "update cmp_genomics.features set note=\"Sigma Factor '$type'\" \ | ||
+ | where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\ | ||
+ | |mysql | ||
+ | end | ||
+ | end | ||
+ | </pre> | ||
+ | <pre> | ||
+ | mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'sigma%' or note is null" | ||
+ | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
+ | foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK) | ||
+ | cat source/$accession.$type.TCS.hmmsearch | \ | ||
+ | perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ | ||
+ | my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\ | ||
+ | next unless $score>0; \ | ||
+ | print "update cmp_genomics.features set note=\"TCS '$type'\" \ | ||
+ | where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\ | ||
+ | |mysql -B | ||
+ | end | ||
+ | end | ||
+ | </pre> | ||
+ | <pre> | ||
+ | rm -f source/all.16s.fsa # in case any exist (from previous runs) | ||
+ | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
+ | mysql -B -N -D cmp_genomics -e "SELECT SUBSTR(seq,start,stop-start+1) AS sequence,\ | ||
+ | REPLACE(substring_index(genomes.organism,' ',4),' ','_') AS name,dir from genomes,features \ | ||
+ | WHERE genomes.accession=features.accession AND genomes.user=features.user AND \ | ||
+ | features.user=user() AND featuretype='rRNA' AND features.accession='$accession'" | \ | ||
+ | perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\ | ||
+ | print ">$F[1]\n"; \ | ||
+ | for($x=0;$x<length($F[0]);$x=$x+60){print substr($F[0],$x,60),"\n";}last;' \ | ||
+ | >> source/all.16s.fsa | ||
+ | end | ||
+ | less source/all.16s.fsa | ||
+ | </pre> | ||
+ | <pre> | ||
+ | cat CP000034.codon-usage.aa.list \ | ||
+ | | perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \ | ||
+ | > aa.dat | ||
+ | mysql -B -N -e 'select accession,organism from cmp_genomics.genomes where user=USER()' \ | ||
+ | > organism.names | ||
+ | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
+ | cat $accession.codon-usage.aa.list \ | ||
+ | |perl -ane 'BEGIN{foreach(`cat organism.names`) \ | ||
+ | {chomp;@A=split(/\t+/,$_);$NAMES{$A[0]}=$A[1];}} \ | ||
+ | next unless $F[1]=~/^[A-Z]+/;$FREQ{$F[1]}=$F[3];push @AA,$F[1]; \ | ||
+ | END{print "\n";print "$NAMES{'$accession'}";foreach $aa(@AA){print "\t",$FREQ{$aa};}}' \ | ||
+ | >> aa.dat | ||
+ | end | ||
+ | </pre> | ||
+ | <pre> | ||
+ | mysql -e "delete from localization where user=USER() and type='signalp'" cmp_genomics | ||
+ | foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) | ||
+ | perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \ | ||
+ | {print "INSERT INTO localization (user,protid,type) \ | ||
+ | SELECT USER() AS user, ft.id AS protid, \"signalp\" AS type \ | ||
+ | FROM features AS ft \ | ||
+ | WHERE ft.user=USER() AND \ | ||
+ | ft.accession=\"'$accession'\" AND \ | ||
+ | ft.start=$1 and ft.stop=$2 AND \ | ||
+ | ft.dir=\"$3\";\n";}' $accession.signalp \ | ||
+ | |mysql -D cmp_genomics | ||
+ | end | ||
+ | </pre> | ||
+ | <pre> | ||
+ | perl -e 'foreach $i(1..11){open IN,"col$i";while(<IN>){chomp;($a,$c)=split(/\t+/,$_);$DATA{$a}{$i}=$c;}close IN;} \ | ||
+ | END{foreach $a(reverse sort keys %DATA){foreach $i(1..11){$DATA{$a}{$i}=0 if $DATA{$a}{$i} eq "" and $i>1; \ | ||
+ | print $DATA{$a}{$i};print "\t" if $i<11;}print "\n";}}' \ | ||
+ | > final.dat | ||
+ | |||
+ | R | ||
+ | postscript('final.ps') | ||
+ | source("/path/to/heatmap.R") | ||
+ | data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t")) | ||
+ | pal<-maPalette(low=rgb(10,10,10,maxColorValue=10), | ||
+ | high=rgb(0,0,10,maxColorValue=10), | ||
+ | mid=rgb(5,5,10,maxColorValue=10),, k=1024) | ||
+ | par(oma=c(0,0,0,0.5)) | ||
+ | data<-scale(data) | ||
+ | new_heatmap(data,cexCol=1,cexRow=1,colors=pal) | ||
+ | dev.off() | ||
+ | quit(save="no") | ||
+ | </pre> | ||
==External links== | ==External links== | ||
*[http://www.ibm.com/developerworks/linux/library/l-p101/ Cultured Perl: One-liners 101] | *[http://www.ibm.com/developerworks/linux/library/l-p101/ Cultured Perl: One-liners 101] | ||
*[http://www.ibm.com/developerworks/linux/library/l-p102.html Cultured Perl: One-liners 102] | *[http://www.ibm.com/developerworks/linux/library/l-p102.html Cultured Perl: One-liners 102] | ||
+ | *[http://homepages.cwi.nl/~rooij/programming/oneliners.html Oneliners and other very short programs] | ||
+ | *[http://www.perlmonks.org/?node_id=135345 Perl's pearls] | ||
+ | *[http://www.ajs.com/ajswiki/Perl_one-liners Perl one-liners] | ||
+ | *[http://www.visualgenomics.ca/gordonp/oneliners.html Perl one-liners for the bioinformatician] | ||
+ | *[http://sial.org/howto/perl/one-liner/ Perl One Liners] | ||
+ | *[http://www.amazon.com/o/ASIN/1932394508/sialorg-20/ref=nosim Minimal Perl: For UNIX and Linux People] | ||
+ | *[http://sysbio.harvard.edu/csb/resources/computational/scriptome/UNIX/Tools/Change.html] | ||
[[Category:Scripting languages]] | [[Category:Scripting languages]] |
Latest revision as of 06:45, 24 September 2007
The following is a compilation of useful one-line scripts for Perl. The original was compiled by Tom Christiansen, however, I have converted it into the wiki format and have added more examples (always adding more).
Contents
Edit a file from the CLI using Perl
perl -p -i.bak -e 's|before|after|' filename # backup of original file 'filename.bak'
perl -p -i -e 's|before|after|ig' filename
- -p
- Execute the command for every line of the file.
- -i
- Invokes in place editing. If a backup extension is provided, a backup of the original file will be made with the extension provided. e.g. -i.bak.
- -e
- Invoke command.
Perl one-liners
- run contents of "my_file" as a program
perl my_file
- run debugger "stand-alone"
perl -d -e 42
- run program, but with warnings
perl -w my_file
- run program under debugger
perl -d my_file
- just check syntax, with warnings
perl -wc my_file
- useful at end of "find foo -print"
perl -nle unlink
- simplest one-liner program
perl -e 'print "hello world!\n"'
- add first and penultimate columns
perl -lane 'print $F[0] + $F[-2]'
- just lines 15 to 17
perl -ne 'print if 15 .. 17' *.pod
- in-place edit of *.c files changing all foo to bar
perl -p -i.bak -e 's/\bfoo\b/bar/g' *.c
- command-line that prints the first 50 lines (cheaply)
perl -pe 'exit if $. > 50' f1 f2 f3 ...
- delete first 10 lines
perl -i.old -ne 'print unless 1 .. 10' foo.txt
- change all the isolated oldvar occurrences to newvar
perl -i.old -pe 's{\boldvar\b}{newvar}g' *.[chy]
- command-line that reverses the whole file by lines
perl -e 'print reverse <>' file1 file2 file3 ....
- find palindromes
perl -lne 'print if $_ eq reverse' /usr/dict/words
- command-line that reverse all the bytes in a file
perl -0777e 'print scalar reverse <>' f1 f2 f3 ...
- command-line that reverses the whole file by paragraphs
perl -00 -e 'print reverse <>' file1 file2 file3 ....
- increment all numbers found in these files
perl i.tiny -pe 's/(\d+)/ 1 + $1 /ge' file1 file2 ....
- command-line that shows each line with its characters backwards
perl -nle 'print scalar reverse $_' file1 file2 file3 ....
- delete all but lines between START and END
perl -i.old -ne 'print unless /^START$/ .. /^END$/' foo.txt
- binary edit (careful!)
perl -i.bak -pe 's/Mozilla/Slopoke/g' /usr/local/bin/netscape
- look for dup words
perl -0777 -ne 'print "$.: doubled $_\n" while /\b(\w+)\b\s+\b\1\b/gi'
- command-line that prints the last 50 lines (expensively)
perl -e '@lines = <>; print @lines[ $#lines .. $#lines-50' f1 f2 f3 ...
- using the ternary operator:
echo "|E..|"|perl -ne 'printf("%s\n",substr($_,1,1)=~/[A-Z]/?substr($_,1,1):"_");' # prints "E" if that substr is A-Z # else prints "_"
The tr/// operator
The "tr///" operator performs a substitution on the individual characters in a string.
- Simple example:
#!/usr/bin/perl -w use strict; my $text = 'some cheese'; $text =~ tr/ce/XY/; print "$text\n";
- More examples:
$x =~ tr/a/b/; # replace each "a" with a "b". $x =~ tr/ /_/; # convert spaces to underlines. $x =~ tr/aeiou/AEIOU/; # capitalise vowels. $x =~ tr/79/97/; # exchange "7" and "9".
The only characters that have special meaning to "tr///" are the backslash and the dash. The latter indicates a range of characters:
$x =~ tr/0-9/QERTYUIOPX/; # Digits to letters. $x =~ tr/A-Z/a-z/; # Convert to lowercase.
Actually, the slash also has a special meaning to "tr///". The slash is called the "delimiter", because it indicates the "limit" on the list of characters to substitute. However, we can use most kinds of punctuation in place of the slash. For example:
$x =~ tr!aeiou!AEIOU!; $x =~ tr:aeiou:AEIOU:;
Note that we can also use parentheses, but the syntax changes a little because parentheses include the idea of containment:
$x =~ tr(aeiou)(AEIOU); $x =~ tr<aeiou><AEIOU>;
The semantics (meaning) don't change; only the syntax (way of writing it) changes. But even though the delimiter is arbitrary, we still talk about it as "tr///".
"tr///" returns the number of replacements it made:
my $salary = '$1,000,000.00'; # Dollar sign: use single quote! my $ego = ($salary =~ tr/0/0/); # Count the zeros in salary.
One more thing: "tr///" has an alias: "y///". This is to please users of sed, which uses the "y///" command do to basically what "tr///" does. In Perl, "tr///" and "y///" do exactly the same thing; use whichever you like. Remember: there is more than one way to do it (TIMTOWTDI).
$text =~ tr/0-9/A-J/; # Convert digits to letters. $text =~ y/0-9/A-J/; # Does exactly the same thing.
Options
"tr///" can take the following options:
c # complement (invert) the search-list. d # delete found but un-replaced characters. s # squash duplicate replaced characters.
These options are specified after the final delimiter, like this:
$x =~ tr/abc/xyz/s; # Note the "s" at the end. $x =~ tr(abc)(xyz)s; # Same thing, but with parentheses. $x =~ tr/abc/xy/ds; # Multiple options. # In the last case, the "z" is missing. You'll see why shortly.
In the last example, we specified both the "d" and the "s" options. The order of the options is not important (i.e., "sd"="ds").
- Examples:
#!/usr/bin/perl -w use strict; ##### The "s" option ##### my $text = 'good cheese'; $text =~ tr/eo/eu/s; print "$text\n"; # Output is: gud chese ##### The "d" option ##### my $big = 'vowels are useful'; $big =~ tr/aeiou/AEI/d; print "$big\n"; # The first three vowels are made uppercase. # The other two, which have no replacement # character, are deleted because of the "d".
Average numbers
A program that reads numbers and outputs the average. Here is one such program:
#!/usr/bin/perl -w use strict; my $line; my $sum=0; my $n=0; while(defined($line=<STDIN>)){ $sum+=$line; $n++; } my $average=$sum/$n; print "The average is $sum/$n = $average\n";
Misc
- Change columns in each line of files:
perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab
- Change tab-separated data to use a different delimiter:
perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv
- Remove spaces in a line:
perl -e 'while(<>){s/ //g;print $_;}' foo
- Change all characters to upper case:
perl -e 'while(<>){print uc($_);}' foo
- Change a FASTA file into tabular format:
perl -e '$count=0;$len=0;while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){ \ if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";} \ else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab
- Change a tabular file into FASTA format:
perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]"; \ if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s); \ $s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa
- Change from one biological format to another:
perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0; \ for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat); \ $out=Bio::SeqIO->newFh(-format =>$outformat); \ while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa
Arrays as Queues
use strict; my @queue; unshift(@queue,"Customer 1"); # @queue is now ("Customer 1") unshift(@queue,"Customer 2"); # @queue is now ("Customer 2" "Customer 1") unshift(@queue,"Customer 3"); # @queue is now ("Customer 3" "Customer 2" "Customer 1") my $item=pop(@queue); # @queue is now ("Customer 3" "Customer 2") print "Servicing $item\n"; # prints: Servicing Customer 1\n $item=pop(@queue); # @queue is now ("Customer 3") print "Servicing $item\n"; # prints: Servicing Customer 2\n
use strict; my @notAqueue; unshift(@notAqueue,"Customer 0","Customer 1"); # @queue is now ("Customer 0","Customer 1") unshift(@notAqueue,"Customer 2"); # @queue is now ("Customer 2","Customer 0","Customer 1")
Transpose matrix
Note: The following could be used to transpose a CSV file.
#!/bin/sh # Example CSV file # FROM: # a1,a2,a3 # b1,b2,b3 # TO: # a1,b1 # a2,b2 # a3,b3 perl -e '$unequal=0;$_=<>;s/\r?\n//;@out_rows=split(/\,/, $_);\ $num_out_rows=$#out_rows+1;while(<>){s/\r?\n//;@F=split(/\,/,$_);\ foreach $i (0 .. $#F){$out_rows[$i] .="\,$F[$i]";}\ if($num_out_rows !=$#F+1){$unequal=1;}} END\ {foreach $row (@out_rows){print"$row\n"} \ warn "\nWARNING! Rows in input had different numbers of columns\n" \ if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1
Perl and bioinformatics
- Complement every base in the file foo.fsa:
perl -pe 'tr/ATCG/TAGC/' foo.fsa
- Add first and fourth columns:
perl -ane 'print $F[0] + $F[3], "\n"' datafile
- Only print lines 15 - 17:
perl -ne 'print if 15 .. 17' *.gff
- Count lines:
perl -ne '$counter++;END{print "$counter lines";}' datafile
- Split a multi-sequence FASTA file into individual files:
perl -ne 'BEGIN{$/=">";} if(/^\s*(\S+)/){open(F,">$1.fsa")||warn"$1 write failed:$!\n";chomp;print F ">",$_}'
Perl and MySQL
This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the CLI.
Note: These examples were taken from my course in Comparative Microbial Genomics at CBS (in Denmark).
mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'tcs%' or note is null" foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) foreach type (ecf s54 s70) cat source/$accession.proteins.$type.sigmas.hmmsearch | \ perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\ next unless $score>0;\ print "update cmp_genomics.features set note=\"Sigma Factor '$type'\" \ where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\ |mysql end end
mysql -B -e "update cmp_genomics.features set note='' where user=USER() and note not like 'sigma%' or note is null" foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK) cat source/$accession.$type.TCS.hmmsearch | \ perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ my($start,$stop,$dir,$score,$evalue)=($1,$2,$3,$4,$5);\ next unless $score>0; \ print "update cmp_genomics.features set note=\"TCS '$type'\" \ where start=$start and stop=$stop and user=user() and accession=\"'$accession'\";\n";'\ |mysql -B end end
rm -f source/all.16s.fsa # in case any exist (from previous runs) foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) mysql -B -N -D cmp_genomics -e "SELECT SUBSTR(seq,start,stop-start+1) AS sequence,\ REPLACE(substring_index(genomes.organism,' ',4),' ','_') AS name,dir from genomes,features \ WHERE genomes.accession=features.accession AND genomes.user=features.user AND \ features.user=user() AND featuretype='rRNA' AND features.accession='$accession'" | \ perl -ane '$F[0]=reverse($F[0]) if $F[2] eq "-";$F[0]=~tr/ATGC/TACG/ if $F[2] eq "-";\ print ">$F[1]\n"; \ for($x=0;$x<length($F[0]);$x=$x+60){print substr($F[0],$x,60),"\n";}last;' \ >> source/all.16s.fsa end less source/all.16s.fsa
cat CP000034.codon-usage.aa.list \ | perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \ > aa.dat mysql -B -N -e 'select accession,organism from cmp_genomics.genomes where user=USER()' \ > organism.names foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) cat $accession.codon-usage.aa.list \ |perl -ane 'BEGIN{foreach(`cat organism.names`) \ {chomp;@A=split(/\t+/,$_);$NAMES{$A[0]}=$A[1];}} \ next unless $F[1]=~/^[A-Z]+/;$FREQ{$F[1]}=$F[3];push @AA,$F[1]; \ END{print "\n";print "$NAMES{'$accession'}";foreach $aa(@AA){print "\t",$FREQ{$aa};}}' \ >> aa.dat end
mysql -e "delete from localization where user=USER() and type='signalp'" cmp_genomics foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \ {print "INSERT INTO localization (user,protid,type) \ SELECT USER() AS user, ft.id AS protid, \"signalp\" AS type \ FROM features AS ft \ WHERE ft.user=USER() AND \ ft.accession=\"'$accession'\" AND \ ft.start=$1 and ft.stop=$2 AND \ ft.dir=\"$3\";\n";}' $accession.signalp \ |mysql -D cmp_genomics end
perl -e 'foreach $i(1..11){open IN,"col$i";while(<IN>){chomp;($a,$c)=split(/\t+/,$_);$DATA{$a}{$i}=$c;}close IN;} \ END{foreach $a(reverse sort keys %DATA){foreach $i(1..11){$DATA{$a}{$i}=0 if $DATA{$a}{$i} eq "" and $i>1; \ print $DATA{$a}{$i};print "\t" if $i<11;}print "\n";}}' \ > final.dat R postscript('final.ps') source("/path/to/heatmap.R") data<-as.matrix(read.table('final.dat',row.names=1,header=TRUE,sep="\t")) pal<-maPalette(low=rgb(10,10,10,maxColorValue=10), high=rgb(0,0,10,maxColorValue=10), mid=rgb(5,5,10,maxColorValue=10),, k=1024) par(oma=c(0,0,0,0.5)) data<-scale(data) new_heatmap(data,cexCol=1,cexRow=1,colors=pal) dev.off() quit(save="no")