Difference between revisions of "Talk:Perl"
From Christoph's Personal Wiki
Line 114: | Line 114: | ||
if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1 | if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1 | ||
</pre> | </pre> | ||
+ | |||
+ | ==Misc== | ||
+ | *Change columns in each line of files: | ||
+ | perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab | ||
+ | |||
+ | *Change tab-separated data to use a different delimiter: | ||
+ | perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv | ||
+ | |||
+ | *Remove spaces in a line: | ||
+ | perl -e 'while(<>){s/ //g;print $_;}' foo | ||
+ | |||
+ | *Change all characters to upper case: | ||
+ | perl -e 'while(<>){print uc($_);}' foo | ||
+ | |||
+ | *Change a FASTA file into tabular format: | ||
+ | perl -e '$count=0; $len=0; while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";}else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab | ||
+ | |||
+ | *Change a tabular file into FASTA format: | ||
+ | perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]";if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s);$s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa | ||
+ | |||
+ | *Change from one biological format to another: | ||
+ | perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0;for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat);$out=Bio::SeqIO->newFh(-format =>$outformat);while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa | ||
+ | |||
+ | ==External links== | ||
+ | *[http://sysbio.harvard.edu/csb/resources/computational/scriptome/UNIX/Tools/Change.html] |
Revision as of 00:46, 27 April 2007
Perl and MySQL
This section will just list a bunch of random examples. They are useful to give you an idea of what you can do with Perl, MySQL, and the CLI.
Note: These examples were taken from my course in Comparative Microbial Genomics at CBS (in Denmark).
mysql -B -e "update cmp_genomics.features set note = '' where user = USER() and note not like 'tcs%' or note is null" foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) foreach type (ecf s54 s70) cat source/$accession.proteins.$type.sigmas.hmmsearch | \ perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ my ($start,$stop,$dir,$score,$evalue) = ($1,$2,$3,$4,$5);\ next unless $score > 0;\ print "update cmp_genomics.features set note = \"Sigma Factor '$type'\" \ where start=$start and stop = $stop and user = user() and accession = \"'$accession'\";\n";'\ | mysql end end
mysql -B -e "update cmp_genomics.features set note = '' where user = USER() and note not like 'sigma%' or note is null" foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) foreach type (RRreciever HisKA_1 HisKA_2 HisKA_3 HWE_HK) cat source/$accession.$type.TCS.hmmsearch | \ perl -ne 'next unless /^CDS_(\d+)\-(\d+)_DIR([\-\+]+)\s+([0-9\-\.e]+)\s+([0-9\-\.e]+)\s+/;\ my ($start,$stop,$dir,$score,$evalue) = ($1,$2,$3,$4,$5);\ next unless $score > 0; \ print "update cmp_genomics.features set note = \"TCS '$type'\" \ where start=$start and stop = $stop and user = user() and accession = \"'$accession'\";\n";'\ | mysql -B end end
rm -f source/all.16s.fsa foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) mysql -B -N -D cmp_genomics -e "select substr(seq,start,stop-start+1) as sequence,\ replace(substring_index(genomes.organism,' ',4),' ','_') as name,dir from genomes,features where \ genomes.accession = features.accession and genomes.user = features.user and \ features.user = user() and featuretype='rRNA' and features.accession = '$accession'" | \ perl -ane '$F[0] = reverse ( $F[0] ) if $F[2] eq "-";$F[0] =~ tr/ATGC/TACG/ if $F[2] eq "-";\ print ">$F[1]\n"; \ for ( $x = 0 ; $x < length ( $F[0] ) ; $x = $x+60) { print substr ( $F[0] , $x , 60),"\n"; } last;' \ >> source/all.16s.fsa end less source/all.16s.fsa
cat ~/Ex3/source/CP000034.codon-usage.aa.list \ | perl -ane 'next unless $F[1] =~ /^[A-Z]+/;print "\t$F[1]"; ' \ > aa.dat mysql -B -N -e 'select accession,organism from cmp_genomics.genomes where user=USER()' \ > organism.names foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) cat ~/Ex3/source/$accession.codon-usage.aa.list \ | perl -ane 'BEGIN { foreach (`cat organism.names`) \ {chomp; @A = split (/\t+/,$_);$NAMES{$A[0]} = $A[1];} } \ next unless $F[1] =~ /^[A-Z]+/;$FREQ{$F[1]} = $F[3]; push @AA,$F[1]; \ END { print "\n"; print "$NAMES{'$accession'}"; foreach $aa (@AA){print "\t",$FREQ{$aa};}}' \ >> aa.dat end
mysql -e "delete from localization where user=USER() and type='signalp'" cmp_genomics foreach accession (AE017042 AE016879 AL111168 AL645882 AP008232 AP009048 BA000021 CP000034) perl -ne 'if(/^CDS_(\d+)-(\d+)_DIR([+-])\s+0.\d+\s+\d+\s+[YN]\s+0.\d+\s+\d+\s+[YN]\s+[10].\d+\s+\d+\s+Y\s+[10].\d+\s+Y\s+[10].\s+\s+Y\s+CDS_\d+-\d+_DIR[+-]\s+Q\s+[10].\d+\s+[10]\s+Y\s+[10].\d+\s+Y/) \ {print "insert into localization (user, protid, type) \ select USER() as user, ft.id as protid, \"signalp\" as type \ from features as ft \ where ft.user=USER() and \ ft.accession = \"'$accession'\" and \ ft.start = $1 and ft.stop = $2 and \ ft.dir = \"$3\";\n";}' $accession.signalp \ | mysql -D cmp_genomics end
perl -e 'foreach $i (1..11) { open IN , "col$i"; while (<IN>) { chomp;($a , $c) = split (/\t+/,$_); $DATA{$a}{$i} = $c;} close IN;} \ END {foreach $a ( reverse sort keys %DATA) {foreach $i (1..11) {$DATA{$a}{$i} = 0 if $DATA{$a}{$i} eq "" and $i > 1; \ print $DATA{$a}{$i}; print "\t" if $i < 11;}print "\n";}}' \ > final.dat R postscript ('final.ps') source ( "/home/people/kiil/Exercises/files/heatmap.R" ) data <- as.matrix ( read.table ( 'final.dat' , row.names=1, header= TRUE,sep = "\t" ) ) pal <- maPalette(low = rgb(10,10,10,maxColorValue = 10), high = rgb(0,0,10,maxColorValue = 10), mid= rgb(5,5,10,maxColorValue = 10),, k = 1024) par(oma=c(0, 0, 0, 0.5)) data <- scale(data) new_heatmap ( data , cexCol=1,cexRow=1,colors = pal) dev.off() quit(save="no")
Transpose matrix
Note: The following could be used to transpose a CSV file.
#!/bin/sh # Example CSV file # FROM: # a1,a2,a3 # b1,b2,b3 # TO: # a1,b1 # a2,b2 # a3,b3 perl -e '$unequal=0;$_=<>;s/\r?\n//;@out_rows=split(/\,/, $_);\ $num_out_rows=$#out_rows+1;while(<>){s/\r?\n//;@F=split(/\,/,$_);\ foreach $i (0 .. $#F){$out_rows[$i] .="\,$F[$i]";}\ if($num_out_rows !=$#F+1){$unequal=1;}} END\ {foreach $row (@out_rows){print"$row\n"} \ warn "\nWARNING! Rows in input had different numbers of columns\n" \ if $unequal;warn "\nTransposed table: result has $. columns and $num_out_rows rows\n\n" }' $1
Misc
- Change columns in each line of files:
perl -e '$sep=",";while(<>){s/\Q$sep\E/\t/g;print $_;}' file.csv >file.tab
- Change tab-separated data to use a different delimiter:
perl -e '$sep=",";while(<>){s/\t/$sep/g;print $_;}' file.tab >file.csv
- Remove spaces in a line:
perl -e 'while(<>){s/ //g;print $_;}' foo
- Change all characters to upper case:
perl -e 'while(<>){print uc($_);}' foo
- Change a FASTA file into tabular format:
perl -e '$count=0; $len=0; while(<>){s/\r?\n//;s/\t/ /g;if(s/^>//){if($. !=1){print "\n"}s/ |$/\t/;$count++;$_ .="\t";}else{s/ //g;$len +=length($_)}print $_;}print "\n";' foo.fsa >foo.tab
- Change a tabular file into FASTA format:
perl -e '$len=0;while(<>){s/\r?\n//;@F=split /\t/,$_;print ">$F[0]";if(length($F[1])){print" $F[1]"}print "\n";$s=$F[2];$len+=length($s);$s=~s/.{60}(?=.)/$&\n/g;print "$s\n";}' foo.tab >foo.fsa
- Change from one biological format to another:
perl -MBio::SeqIO -e '$informat="genbank";$outformat="fasta";$count=0;for $infile (@ARGV){$in=Bio::SeqIO->newFh(-file =>$infile, -format =>$informat);$out=Bio::SeqIO->newFh(-format =>$outformat);while(<$in>){print $out $_;$count++;}}' foo.gbk >foo.fsa