Difference between revisions of "Pfam"
(→See also) |
|||
(5 intermediate revisions by the same user not shown) | |||
Line 6: | Line 6: | ||
''Note: This is taken directly from the [http://www.sanger.ac.uk/Software/Pfam/help/rdb_doc.shtml Pfam website]. It is archived here in case they take it offline.'' | ''Note: This is taken directly from the [http://www.sanger.ac.uk/Software/Pfam/help/rdb_doc.shtml Pfam website]. It is archived here in case they take it offline.'' | ||
− | Installation | + | See: Installation documentation here: [[MySQL]]. |
===Database structure=== | ===Database structure=== | ||
− | The table structures are in the '.sql' files found in [ftp://ftp.sanger.ac.uk/pub/databases/Pfam/database_files/ .sql] files. | + | The table structures are in the '<code>.sql</code>' files found in [ftp://ftp.sanger.ac.uk/pub/databases/Pfam/database_files/ .sql] files. |
===Table data=== | ===Table data=== | ||
Line 33: | Line 33: | ||
SELECT architecture, pfamseq_id | SELECT architecture, pfamseq_id | ||
− | FROM pfamA,architecture, pfamA_architecture, pfamseq_architecture, pfamseq | + | FROM pfamA,architecture, pfamA_architecture, pfamseq_architecture, pfamseq |
− | WHERE pfamA_id = "B12-binding" | + | WHERE pfamA_id = "B12-binding" |
− | AND pfamA.auto_pfamA = pfamA_architecture.auto_pfamA | + | AND pfamA.auto_pfamA = pfamA_architecture.auto_pfamA |
− | AND pfamA_architecture.auto_architecture = architecture.auto_architecture | + | AND pfamA_architecture.auto_architecture = architecture.auto_architecture |
− | AND architecture.auto_architecture = pfamseq_architecture.auto_architecture | + | AND architecture.auto_architecture = pfamseq_architecture.auto_architecture |
− | AND pfamseq_architecture.auto_pfamseq = pfamseq.auto_pfamseq; | + | AND pfamseq_architecture.auto_pfamseq = pfamseq.auto_pfamseq; |
*View proteins by protein id/accession | *View proteins by protein id/accession | ||
− | QUERY: Give me all the domains for protein PLCG1_BOVIN | + | QUERY: Give me all the domains for protein "<code>PLCG1_BOVIN</code>" |
SEED ALIGNMENT: | SEED ALIGNMENT: | ||
SELECT pfamA_acc, pfamA_id, seq_start, seq_end | SELECT pfamA_acc, pfamA_id, seq_start, seq_end | ||
− | FROM pfamseq, pfamA, pfamA_reg_seed | + | FROM pfamseq, pfamA, pfamA_reg_seed |
− | WHERE pfamseq_id = "pig1_bovin" | + | WHERE pfamseq_id = "pig1_bovin" |
− | AND pfamseq.auto_pfamseq = pfamA_reg_seed.auto_pfamseq | + | AND pfamseq.auto_pfamseq = pfamA_reg_seed.auto_pfamseq |
− | AND pfamA_reg_seed.auto_pfamA = pfamA.auto_pfamA; | + | AND pfamA_reg_seed.auto_pfamA = pfamA.auto_pfamA; |
+-----------+----------+-----------+---------+ | +-----------+----------+-----------+---------+ | ||
| pfamA_acc | pfamA_id | seq_start | seq_end | | | pfamA_acc | pfamA_id | seq_start | seq_end | | ||
Line 59: | Line 59: | ||
3 rows in set (0.03 sec) | 3 rows in set (0.03 sec) | ||
− | FULL ALIGNMENT: This table is different from pfamA_reg_seed as it has the "in_full" column. | + | FULL ALIGNMENT: This table is different from <code>pfamA_reg_seed</code> as it has the "<code>in_full</code>" column. You have to set the <code>in_full = "1"</code> if you want ONLY the domains that are in the full alignment and are significant. Otherwise it will include the insignificant matches and return thousands of rows! |
− | You have to set the in_full = "1" if you want ONLY the domains that are in the full alignment and are significant. | + | |
− | Otherwise it will include the insignificant matches and return thousands of rows! | + | |
SELECT pfamA_acc, pfamA_id, seq_start, seq_end | SELECT pfamA_acc, pfamA_id, seq_start, seq_end | ||
− | FROM pfamseq, pfamA, pfamA_reg_full | + | FROM pfamseq, pfamA, pfamA_reg_full |
− | WHERE pfamseq_id = "PLCG1_BOVIN" | + | WHERE pfamseq_id = "PLCG1_BOVIN" |
− | AND pfamseq.auto_pfamseq = pfamA_reg_full.auto_pfamseq | + | AND pfamseq.auto_pfamseq = pfamA_reg_full.auto_pfamseq |
− | AND pfamA_reg_full.auto_pfamA = pfamA.auto_pfamA | + | AND pfamA_reg_full.auto_pfamA = pfamA.auto_pfamA |
− | AND in_full = "1"; | + | AND in_full = "1"; |
+-----------+----------+-----------+---------+ | +-----------+----------+-----------+---------+ | ||
| pfamA_acc | pfamA_id | seq_start | seq_end | | | pfamA_acc | pfamA_id | seq_start | seq_end | | ||
Line 92: | Line 90: | ||
AND pfamB_reg.auto_pfamB = pfamB.auto_pfamB; | AND pfamB_reg.auto_pfamB = pfamB.auto_pfamB; | ||
− | *Other regions - transmembrane, signal-peptide, coiled-coils | + | *Other regions - transmembrane, signal-peptide, coiled-coils, and low-complexity |
SELECT seq_start, seq_end, type_id, source_id, score | SELECT seq_start, seq_end, type_id, source_id, score | ||
− | FROM other_reg, pfamseq | + | FROM other_reg, pfamseq |
− | WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' | + | WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' |
− | AND other_reg.auto_pfamseq = pfamseq.auto_pfamseq; | + | AND other_reg.auto_pfamseq = pfamseq.auto_pfamseq; |
*Context regions | *Context regions | ||
SELECT seq_start, seq_end, domain_score, pfamA.pfamA_acc, pfamA_id, pfamA.description | SELECT seq_start, seq_end, domain_score, pfamA.pfamA_acc, pfamA_id, pfamA.description | ||
− | FROM context_pfam_regions, pfamseq, pfamA | + | FROM context_pfam_regions, pfamseq, pfamA |
− | WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' | + | WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' |
− | AND context_pfam_regions.auto_pfamseq = pfamseq.auto_pfamseq | + | AND context_pfam_regions.auto_pfamseq = pfamseq.auto_pfamseq |
− | AND pfamA.auto_pfamA = context_pfam_regions.auto_pfamA; | + | AND pfamA.auto_pfamA = context_pfam_regions.auto_pfamA; |
− | + | ||
*Smart regions | *Smart regions | ||
SELECT seq_start, seq_end, smart_id; | SELECT seq_start, seq_end, smart_id; | ||
− | FROM smart_regions, pfamseq, smart | + | FROM smart_regions, pfamseq, smart |
− | WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' | + | WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' |
− | AND smart_regions.auto_pfamseq = pfamseq.auto_pfamseq | + | AND smart_regions.auto_pfamseq = pfamseq.auto_pfamseq |
− | AND smart.auto_smart = smart_regions.auto_smart; | + | AND smart.auto_smart = smart_regions.auto_smart; |
====Query domain by family ID/ACC==== | ====Query domain by family ID/ACC==== | ||
Line 120: | Line 118: | ||
SELECT pfamseq_id, seq_start, seq_end, pfamA_id | SELECT pfamseq_id, seq_start, seq_end, pfamA_id | ||
− | FROM pfamA, pfamseq, pfamA_reg_full | + | FROM pfamA, pfamseq, pfamA_reg_full |
− | WHERE pfamA_id = "B12D" | + | WHERE pfamA_id = "B12D" |
− | AND pfamA.auto_pfamA = pfamA_reg_full.auto_pfamA | + | AND pfamA.auto_pfamA = pfamA_reg_full.auto_pfamA |
− | AND pfamA_reg_full.auto_pfamseq = pfamseq.auto_pfamseq | + | AND pfamA_reg_full.auto_pfamseq = pfamseq.auto_pfamseq |
− | AND in_full = "1"; | + | AND in_full = "1"; |
+------------+-----------+---------+----------+ | +------------+-----------+---------+----------+ | ||
| pfamseq_id | seq_start | seq_end | pfamA_id | | | pfamseq_id | seq_start | seq_end | pfamA_id | | ||
Line 141: | Line 139: | ||
SELECT pfamseq_id, seq_start, seq_end, pfamA_id | SELECT pfamseq_id, seq_start, seq_end, pfamA_id | ||
− | FROM pfamA, pfamseq, pfamA_reg_seed | + | FROM pfamA, pfamseq, pfamA_reg_seed |
− | WHERE pfamA_id = "B12D" | + | WHERE pfamA_id = "B12D" |
− | AND pfamA.auto_pfamA = pfamA_reg_seed.auto_pfamA | + | AND pfamA.auto_pfamA = pfamA_reg_seed.auto_pfamA |
− | AND pfamA_reg_seed.auto_pfamseq = pfamseq.auto_pfamseq; | + | AND pfamA_reg_seed.auto_pfamseq = pfamseq.auto_pfamseq; |
+------------+-----------+---------+----------+ | +------------+-----------+---------+----------+ | ||
| pfamseq_id | seq_start | seq_end | pfamA_id | | | pfamseq_id | seq_start | seq_end | pfamA_id | | ||
Line 161: | Line 159: | ||
SELECT * | SELECT * | ||
− | FROM pfamA , pfamA_web | + | FROM pfamA, pfamA_web |
− | WHERE pfamA_id = "CBS" | + | WHERE pfamA_id = "CBS" |
− | AND pfamA.auto_pfamA = pfamA_web.auto_pfamA; | + | AND pfamA.auto_pfamA = pfamA_web.auto_pfamA; |
*Interpro annotation | *Interpro annotation | ||
SELECT interpro_id, abstract | SELECT interpro_id, abstract | ||
− | FROM interpro AS i, | + | FROM interpro AS i, pfamA AS p |
− | + | WHERE p.auto_pfamA = i.auto_pfamA | |
− | WHERE p.auto_pfamA = i.auto_pfamA | + | AND pfamA_id = "CBS"; |
− | + | ||
*Gene Ontology (GO) annotation | *Gene Ontology (GO) annotation | ||
SELECT go_id, term, category | SELECT go_id, term, category | ||
− | FROM gene_ontology AS go, | + | FROM gene_ontology AS go, pfamA AS p |
− | + | WHERE go.auto_pfamA = p.auto_pfamA | |
− | WHERE go.auto_pfamA = p.auto_pfamA | + | AND pfamA_acc = "PF00067"; |
− | + | ||
*Literature references | *Literature references | ||
− | SELECT pfamA_literature_references.comment, order_added, medline, title, literature_references.author, journal | + | SELECT pfamA_literature_references.comment, order_added, medline, |
+ | title, literature_references.author, journal | ||
FROM pfamA, pfamA_literature_references, literature_references | FROM pfamA, pfamA_literature_references, literature_references | ||
WHERE pfamA_id = "CBS" | WHERE pfamA_id = "CBS" | ||
Line 192: | Line 189: | ||
SELECT db_id, pfamA_database_links.comment, db_link, other_params | SELECT db_id, pfamA_database_links.comment, db_link, other_params | ||
− | FROM pfamA, pfamA_database_links | + | FROM pfamA, pfamA_database_links |
− | WHERE pfamA_id = "CBS" | + | WHERE pfamA_id = "CBS" |
− | AND pfamA.auto_pfamA = pfamA_database_links.auto_pfamA; | + | AND pfamA.auto_pfamA = pfamA_database_links.auto_pfamA; |
====Structures and domain interactions (iPfam)==== | ====Structures and domain interactions (iPfam)==== | ||
Line 200: | Line 197: | ||
*Tables containing PDB information | *Tables containing PDB information | ||
− | There are two tables containing information primary about PDB structures: pdb and msd_data. The pdb table contains a list of PDB identifiers, the header and the title records from that PDB file. | + | There are two tables containing information primary about PDB structures: <code>pdb</code> and <code>msd_data</code>. The <code>pdb</code> table contains a list of PDB identifiers, the header and the title records from that PDB file. |
SELECT pdb_id, header, title FROM pdb WHERE pdb_id="2abl"; | SELECT pdb_id, header, title FROM pdb WHERE pdb_id="2abl"; | ||
Line 210: | Line 207: | ||
1 row in set (0.00 sec) | 1 row in set (0.00 sec) | ||
− | For the PDB information to be useful to Pfam we need to map between PDB residues and UniProt sequence | + | For the PDB information to be useful to Pfam we need to map between PDB residues and UniProt sequence residues. This is not a trivial task! This mapping information is provided by the MSD database. See [http://www.ebi.ac.uk/msd-srv/docs/sifts/ here] for more details. The <code>msd_data</code> table contains this residue by residue mapping. |
− | The following statement gets the first 10 residue mappings for the structure 2ABL. | + | The following statement gets the first 10 residue mappings for the structure <code>2ABL</code>. |
SELECT pdb_id, pdb_res, pdb_seq_number, pfamseq_acc, pfamseq_res, pfamseq_seq_number | SELECT pdb_id, pdb_res, pdb_seq_number, pfamseq_acc, pfamseq_res, pfamseq_seq_number | ||
− | FROM msd_data, pdb, pfamseq | + | FROM msd_data, pdb, pfamseq |
− | WHERE pdb.auto_pdb=msd_data.auto_pdb | + | WHERE pdb.auto_pdb=msd_data.auto_pdb |
− | AND pfamseq.auto_pfamseq=msd_data.auto_pfamseq | + | AND pfamseq.auto_pfamseq=msd_data.auto_pfamseq |
− | AND pdb_id="2abl" | + | AND pdb_id="2abl" |
− | LIMIT 10; | + | LIMIT 10; |
+--------+---------+----------------+-------------+-------------+--------------------+ | +--------+---------+----------------+-------------+-------------+--------------------+ | ||
| pdb_id | pdb_res | pdb_seq_number | pfamseq_acc | pfamseq_res | pfamseq_seq_number | | | pdb_id | pdb_res | pdb_seq_number | pfamseq_acc | pfamseq_res | pfamseq_seq_number | | ||
Line 236: | Line 233: | ||
10 rows in set (0.00 sec) | 10 rows in set (0.00 sec) | ||
− | Using a similar query to the previous one, we generate a mapping for each Pfam domain with a known structure. This information is stored in the pdbmap table. | + | Using a similar query to the previous one, we generate a mapping for each Pfam domain with a known structure. This information is stored in the <code>pdbmap</code> table. |
SELECT pdb_id, chain, pdb_start_res, pdb_end_res | SELECT pdb_id, chain, pdb_start_res, pdb_end_res | ||
− | FROM pdb, pdbmap, pfamA | + | FROM pdb, pdbmap, pfamA |
− | WHERE pfamA_id = 'CBS' | + | WHERE pfamA_id = 'CBS' |
− | AND pfamA.auto_pfamA = pdbmap.auto_pfam | + | AND pfamA.auto_pfamA = pdbmap.auto_pfam |
− | AND pfam_region = '1' | + | AND pfam_region = '1' |
− | AND pdbmap.auto_pdb = pdb.auto_pdb; | + | AND pdbmap.auto_pdb = pdb.auto_pdb; |
− | *Tables Containing Domain Interaction Information | + | *Tables Containing Domain Interaction Information (i.e., iPfam) |
iPfam is a database within a database. iPfam contains information about domain-domain interactions. | iPfam is a database within a database. iPfam contains information about domain-domain interactions. | ||
− | The main table for iPfam is the interaction table. This is a large denormalised table that contains all of the interaction information to the residue-residue level. The interaction table can be joined onto the following tables: pdb, pfamA, pfamseq, int_atom, int_pfamAs | + | The main table for iPfam is the interaction table. This is a large denormalised table that contains all of the interaction information to the residue-residue level. The interaction table can be joined onto the following tables: <code>pdb, pfamA, pfamseq, int_atom, int_pfamAs</code> |
− | SELECT distinct interaction.auto_pfamA_A, interaction.pfamA_id_A, interaction.auto_pfamA_B, interaction.pfamA_id_B | + | SELECT distinct interaction.auto_pfamA_A, interaction.pfamA_id_A, |
− | FROM interaction, pdb | + | interaction.auto_pfamA_B, interaction.pfamA_id_B |
− | WHERE pdb_id="2abl" | + | FROM interaction, pdb |
− | AND interaction.auto_pdb=pdb.auto_pdb; | + | WHERE pdb_id="2abl" |
+ | AND interaction.auto_pdb=pdb.auto_pdb; | ||
− | There are three other tables that are part of the iPfam specific tables. The int_atom contains the in atom numbers that are forming the interaction. The int_bond table contains the actual bond that is formed between the interaction. | + | There are three other tables that are part of the iPfam specific tables. The <code>int_atom</code> contains the in atom numbers that are forming the interaction. The <code>int_bond</code> table contains the actual bond that is formed between the interaction. |
− | SELECT pdb_id, interaction.pdb_seq_number_A, pfamseq_seq_number_B, int_atom.pdb_atom, int_atom.partner_pdb_atom | + | SELECT pdb_id, interaction.pdb_seq_number_A, pfamseq_seq_number_B, |
− | FROM interaction, pdb, int_atom | + | int_atom.pdb_atom, int_atom.partner_pdb_atom |
− | WHERE pdb_id="2abl" | + | FROM interaction, pdb, int_atom |
− | AND int_atom.auto_atom_int=interaction.auto_atom_int | + | WHERE pdb_id="2abl" |
− | AND interaction.auto_pdb=pdb.auto_pdb | + | AND int_atom.auto_atom_int=interaction.auto_atom_int |
− | LIMIT 10; | + | AND interaction.auto_pdb=pdb.auto_pdb |
+ | LIMIT 10; | ||
− | Finally, the int_pfamAs table gives the listing of the Domain-Domain interactions found in iPfam. | + | Finally, the <code>int_pfamAs</code> table gives the listing of the Domain-Domain interactions found in iPfam. |
====Genomes==== | ====Genomes==== | ||
Line 274: | Line 273: | ||
SELECT ncbi_code, species, num_distinct_regions, num_total_regions, num_proteins, | SELECT ncbi_code, species, num_distinct_regions, num_total_regions, num_proteins, | ||
sequence_coverage, residue_coverage, total_genome_proteins | sequence_coverage, residue_coverage, total_genome_proteins | ||
− | FROM genome_species | + | FROM genome_species |
− | WHERE grouping like '%Bacteria%' | + | WHERE grouping like '%Bacteria%' |
ORDER BY species; | ORDER BY species; | ||
Line 281: | Line 280: | ||
SELECT genome_seqs.auto_pfamA, pfamA_acc, pfamA_id, description, sum(count) | SELECT genome_seqs.auto_pfamA, pfamA_acc, pfamA_id, description, sum(count) | ||
− | FROM genome_seqs, pfamA | + | FROM genome_seqs, pfamA |
− | WHERE genome_seqs.ncbi_code = '1423' | + | WHERE genome_seqs.ncbi_code = '1423' |
− | AND genome_seqs.auto_pfamA = pfamA.auto_pfamA | + | AND genome_seqs.auto_pfamA = pfamA.auto_pfamA |
GROUP BY genome_seqs.auto_pfamA; | GROUP BY genome_seqs.auto_pfamA; | ||
Line 289: | Line 288: | ||
SELECT pfamseq.pfamseq_id | SELECT pfamseq.pfamseq_id | ||
− | FROM pfamseq, genome_seqs | + | FROM pfamseq, genome_seqs |
− | WHERE ncbi_code = '1423' | + | WHERE ncbi_code = '1423' |
− | AND genome_seqs.auto_pfamseq = pfamseq.auto_pfamseq; | + | AND genome_seqs.auto_pfamseq = pfamseq.auto_pfamseq; |
*Return all the protein sequences for a species and a specific Pfam-A domain | *Return all the protein sequences for a species and a specific Pfam-A domain | ||
SELECT pfamseq.pfamseq_id | SELECT pfamseq.pfamseq_id | ||
− | FROM pfamseq, genome_seqs, pfamA | + | FROM pfamseq, genome_seqs, pfamA |
− | WHERE ncbi_code = '1423' | + | WHERE ncbi_code = '1423' |
− | AND genome_seqs.auto_pfamseq = pfamseq.auto_pfamseq | + | AND genome_seqs.auto_pfamseq = pfamseq.auto_pfamseq |
− | AND genome_seqs.auto_pfamA = pfamA.auto_pfamA | + | AND genome_seqs.auto_pfamA = pfamA.auto_pfamA |
− | AND pfamA_acc = 'PF00106'; | + | AND pfamA_acc = 'PF00106'; |
==See also== | ==See also== | ||
*[http://www.ebi.ac.uk/integr8/EBI-Integr8-HomePage.do Integr8] — Access to complete genomes and proteomes | *[http://www.ebi.ac.uk/integr8/EBI-Integr8-HomePage.do Integr8] — Access to complete genomes and proteomes | ||
− | *[[TrEMBL]] Database performing an automated protein sequence annotation | + | *[[TrEMBL]] — Database performing an automated protein sequence annotation |
− | *[[InterPro]] Integration of protein domain and protein family databases | + | *[[InterPro]] — Integration of protein domain and protein family databases |
==References== | ==References== |
Latest revision as of 08:23, 8 September 2007
The Pfam database contains information about protein domains and families. Pfam-A is the manually curated portion of the database that contains over 8,000 entries. For each entry a protein sequence alignment and a hidden Markov model is stored. These hidden Markov models can be used to search sequence databases with the HMMer package written by Sean Eddy. Because the entries in Pfam-A do not cover all known proteins an automatically generated supplement is provided called Pfam-B. Pfam-B is derived from the PRODOM database.
The database iPfam builds on the domain description of Pfam. It investigates if different proteins described together in the protein structure database PDB are close enough to potentially interact.
Pfam MySQL database documentation
Note: This is taken directly from the Pfam website. It is archived here in case they take it offline.
See: Installation documentation here: MySQL.
Database structure
The table structures are in the '.sql
' files found in .sql files.
Table data
The table data is in the form: "THE_TABLE_NAME".sql.gz
(i.e., pfamA table data is in pfamA.sql.gz
)
The files can be downloaded from ftp://ftp.sanger.ac.uk/pub/databases/Pfam/database_files/
- Upload files
- gunzip files
- Load into MySQL using:
mysql> load data infile 'FULL_PATH_DIR/pfamA.sql' into table pfamA;
Table queries
Query domains by architecture and protein ID/ACC
- View proteins by Architecture
QUERY: Give me all the architectures and protein sequences for B12-binding family.
The architecture information is in 3 tables:
architecture pfamA_architecture pfamseq_architecture
SELECT architecture, pfamseq_id FROM pfamA,architecture, pfamA_architecture, pfamseq_architecture, pfamseq WHERE pfamA_id = "B12-binding" AND pfamA.auto_pfamA = pfamA_architecture.auto_pfamA AND pfamA_architecture.auto_architecture = architecture.auto_architecture AND architecture.auto_architecture = pfamseq_architecture.auto_architecture AND pfamseq_architecture.auto_pfamseq = pfamseq.auto_pfamseq;
- View proteins by protein id/accession
QUERY: Give me all the domains for protein "PLCG1_BOVIN
"
SEED ALIGNMENT:
SELECT pfamA_acc, pfamA_id, seq_start, seq_end FROM pfamseq, pfamA, pfamA_reg_seed WHERE pfamseq_id = "pig1_bovin" AND pfamseq.auto_pfamseq = pfamA_reg_seed.auto_pfamseq AND pfamA_reg_seed.auto_pfamA = pfamA.auto_pfamA; +-----------+----------+-----------+---------+ | pfamA_acc | pfamA_id | seq_start | seq_end | +-----------+----------+-----------+---------+ | PF00168 | C2 | 1090 | 1177 | | PF00388 | PI-PLC-X | 321 | 465 | | PF00018 | SH3_1 | 794 | 849 | +-----------+----------+-----------+---------+ 3 rows in set (0.03 sec)
FULL ALIGNMENT: This table is different from pfamA_reg_seed
as it has the "in_full
" column. You have to set the in_full = "1"
if you want ONLY the domains that are in the full alignment and are significant. Otherwise it will include the insignificant matches and return thousands of rows!
SELECT pfamA_acc, pfamA_id, seq_start, seq_end FROM pfamseq, pfamA, pfamA_reg_full WHERE pfamseq_id = "PLCG1_BOVIN" AND pfamseq.auto_pfamseq = pfamA_reg_full.auto_pfamseq AND pfamA_reg_full.auto_pfamA = pfamA.auto_pfamA AND in_full = "1"; +-----------+----------+-----------+---------+ | pfamA_acc | pfamA_id | seq_start | seq_end | +-----------+----------+-----------+---------+ | PF00168 | C2 | 1090 | 1177 | | PF00017 | SH2 | 550 | 639 | | PF00017 | SH2 | 668 | 741 | | PF00388 | PI-PLC-X | 321 | 465 | | PF00018 | SH3_1 | 794 | 849 | | PF00169 | PH | 33 | 142 | | PF00387 | PI-PLC-Y | 952 | 1070 | +-----------+----------+-----------+---------+ 7 rows in set (0.00 sec)
- View other regions by protein id/accession
pfam-B regions
SELECT distinct seq_start, seq_end, pfamB.pfamB_acc, pfamB_id FROM pfamB_reg, pfamB, pfamseq WHERE pfamseq_id = 'PLCG1_BOVIN' AND pfamB_reg.auto_pfamseq = pfamseq.auto_pfamseq AND pfamB_reg.auto_pfamB = pfamB.auto_pfamB;
- Other regions - transmembrane, signal-peptide, coiled-coils, and low-complexity
SELECT seq_start, seq_end, type_id, source_id, score FROM other_reg, pfamseq WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' AND other_reg.auto_pfamseq = pfamseq.auto_pfamseq;
- Context regions
SELECT seq_start, seq_end, domain_score, pfamA.pfamA_acc, pfamA_id, pfamA.description FROM context_pfam_regions, pfamseq, pfamA WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' AND context_pfam_regions.auto_pfamseq = pfamseq.auto_pfamseq AND pfamA.auto_pfamA = context_pfam_regions.auto_pfamA;
- Smart regions
SELECT seq_start, seq_end, smart_id; FROM smart_regions, pfamseq, smart WHERE pfamseq.pfamseq_id = 'PLCG1_BOVIN' AND smart_regions.auto_pfamseq = pfamseq.auto_pfamseq AND smart.auto_smart = smart_regions.auto_smart;
Query domain by family ID/ACC
- All domains for a family (FULL alignment)
SELECT pfamseq_id, seq_start, seq_end, pfamA_id FROM pfamA, pfamseq, pfamA_reg_full WHERE pfamA_id = "B12D" AND pfamA.auto_pfamA = pfamA_reg_full.auto_pfamA AND pfamA_reg_full.auto_pfamseq = pfamseq.auto_pfamseq AND in_full = "1"; +------------+-----------+---------+----------+ | pfamseq_id | seq_start | seq_end | pfamA_id | +------------+-----------+---------+----------+ | Q42338 | 2 | 88 | B12D | | Q9LJ47 | 1 | 87 | B12D | | Q9XHD5 | 3 | 89 | B12D | | O22414 | 3 | 89 | B12D | | Q940E1 | 29 | 116 | B12D | | Q40019 | 2 | 87 | B12D | | Q84MX3 | 87 | 173 | B12D | +------------+-----------+---------+----------+ 7 rows in set (0.02 sec)
- All domains for a family (SEED alignment)
SELECT pfamseq_id, seq_start, seq_end, pfamA_id FROM pfamA, pfamseq, pfamA_reg_seed WHERE pfamA_id = "B12D" AND pfamA.auto_pfamA = pfamA_reg_seed.auto_pfamA AND pfamA_reg_seed.auto_pfamseq = pfamseq.auto_pfamseq; +------------+-----------+---------+----------+ | pfamseq_id | seq_start | seq_end | pfamA_id | +------------+-----------+---------+----------+ | Q42338 | 2 | 88 | B12D | | O22414 | 3 | 89 | B12D | | Q9XHD5 | 3 | 89 | B12D | | Q940E1 | 29 | 116 | B12D | | Q9LJ47 | 1 | 87 | B12D | +------------+-----------+---------+----------+ 5 rows in set (0.00 sec)
Annotation: Information displayed on family page
- Pfam annotation
SELECT * FROM pfamA, pfamA_web WHERE pfamA_id = "CBS" AND pfamA.auto_pfamA = pfamA_web.auto_pfamA;
- Interpro annotation
SELECT interpro_id, abstract FROM interpro AS i, pfamA AS p WHERE p.auto_pfamA = i.auto_pfamA AND pfamA_id = "CBS";
- Gene Ontology (GO) annotation
SELECT go_id, term, category FROM gene_ontology AS go, pfamA AS p WHERE go.auto_pfamA = p.auto_pfamA AND pfamA_acc = "PF00067";
- Literature references
SELECT pfamA_literature_references.comment, order_added, medline, title, literature_references.author, journal FROM pfamA, pfamA_literature_references, literature_references WHERE pfamA_id = "CBS" AND pfamA.auto_pfamA = pfamA_literature_references.auto_pfamA AND pfamA_literature_references.auto_lit = literature_references.auto_lit;
- Database References
SELECT db_id, pfamA_database_links.comment, db_link, other_params FROM pfamA, pfamA_database_links WHERE pfamA_id = "CBS" AND pfamA.auto_pfamA = pfamA_database_links.auto_pfamA;
Structures and domain interactions (iPfam)
- Tables containing PDB information
There are two tables containing information primary about PDB structures: pdb
and msd_data
. The pdb
table contains a list of PDB identifiers, the header and the title records from that PDB file.
SELECT pdb_id, header, title FROM pdb WHERE pdb_id="2abl"; +--------+--------------+-----------------------------------------------------------+ | pdb_id | header | title | +--------+--------------+-----------------------------------------------------------+ | 2abl | Transferase | Sh3-sh2 domain fragment of human bcr-abl tyrosine kinase | +--------+--------------+-----------------------------------------------------------+ 1 row in set (0.00 sec)
For the PDB information to be useful to Pfam we need to map between PDB residues and UniProt sequence residues. This is not a trivial task! This mapping information is provided by the MSD database. See here for more details. The msd_data
table contains this residue by residue mapping.
The following statement gets the first 10 residue mappings for the structure 2ABL
.
SELECT pdb_id, pdb_res, pdb_seq_number, pfamseq_acc, pfamseq_res, pfamseq_seq_number FROM msd_data, pdb, pfamseq WHERE pdb.auto_pdb=msd_data.auto_pdb AND pfamseq.auto_pfamseq=msd_data.auto_pfamseq AND pdb_id="2abl" LIMIT 10; +--------+---------+----------------+-------------+-------------+--------------------+ | pdb_id | pdb_res | pdb_seq_number | pfamseq_acc | pfamseq_res | pfamseq_seq_number | +--------+---------+----------------+-------------+-------------+--------------------+ | 2abl | MET | 75 | P00519 | A | 56 | | 2abl | GLY | 76 | P00519 | G | 57 | | 2abl | PRO | 77 | P00519 | P | 58 | | 2abl | SER | 78 | P00519 | S | 59 | | 2abl | GLU | 79 | P00519 | E | 60 | | 2abl | ASN | 80 | P00519 | N | 61 | | 2abl | ASP | 81 | P00519 | D | 62 | | 2abl | PRO | 82 | P00519 | P | 63 | | 2abl | ASN | 83 | P00519 | N | 64 | | 2abl | LEU | 84 | P00519 | L | 65 | +--------+---------+----------------+-------------+-------------+--------------------+ 10 rows in set (0.00 sec)
Using a similar query to the previous one, we generate a mapping for each Pfam domain with a known structure. This information is stored in the pdbmap
table.
SELECT pdb_id, chain, pdb_start_res, pdb_end_res FROM pdb, pdbmap, pfamA WHERE pfamA_id = 'CBS' AND pfamA.auto_pfamA = pdbmap.auto_pfam AND pfam_region = '1' AND pdbmap.auto_pdb = pdb.auto_pdb;
- Tables Containing Domain Interaction Information (i.e., iPfam)
iPfam is a database within a database. iPfam contains information about domain-domain interactions.
The main table for iPfam is the interaction table. This is a large denormalised table that contains all of the interaction information to the residue-residue level. The interaction table can be joined onto the following tables: pdb, pfamA, pfamseq, int_atom, int_pfamAs
SELECT distinct interaction.auto_pfamA_A, interaction.pfamA_id_A, interaction.auto_pfamA_B, interaction.pfamA_id_B FROM interaction, pdb WHERE pdb_id="2abl" AND interaction.auto_pdb=pdb.auto_pdb;
There are three other tables that are part of the iPfam specific tables. The int_atom
contains the in atom numbers that are forming the interaction. The int_bond
table contains the actual bond that is formed between the interaction.
SELECT pdb_id, interaction.pdb_seq_number_A, pfamseq_seq_number_B, int_atom.pdb_atom, int_atom.partner_pdb_atom FROM interaction, pdb, int_atom WHERE pdb_id="2abl" AND int_atom.auto_atom_int=interaction.auto_atom_int AND interaction.auto_pdb=pdb.auto_pdb LIMIT 10;
Finally, the int_pfamAs
table gives the listing of the Domain-Domain interactions found in iPfam.
Genomes
Note: ncbi_code 1423
is for species: Bacillus subtilis. This information if found in the ncbi_taxonomy
table.
- Return all the species and basic Pfam information for a Kingdom:
SELECT ncbi_code, species, num_distinct_regions, num_total_regions, num_proteins, sequence_coverage, residue_coverage, total_genome_proteins FROM genome_species WHERE grouping like '%Bacteria%' ORDER BY species;
- Return all the Pfam-A domains for a species (using ncbi codes)
SELECT genome_seqs.auto_pfamA, pfamA_acc, pfamA_id, description, sum(count) FROM genome_seqs, pfamA WHERE genome_seqs.ncbi_code = '1423' AND genome_seqs.auto_pfamA = pfamA.auto_pfamA GROUP BY genome_seqs.auto_pfamA;
- Return the protein sequences for a species
SELECT pfamseq.pfamseq_id FROM pfamseq, genome_seqs WHERE ncbi_code = '1423' AND genome_seqs.auto_pfamseq = pfamseq.auto_pfamseq;
- Return all the protein sequences for a species and a specific Pfam-A domain
SELECT pfamseq.pfamseq_id FROM pfamseq, genome_seqs, pfamA WHERE ncbi_code = '1423' AND genome_seqs.auto_pfamseq = pfamseq.auto_pfamseq AND genome_seqs.auto_pfamA = pfamA.auto_pfamA AND pfamA_acc = 'PF00106';
See also
- Integr8 — Access to complete genomes and proteomes
- TrEMBL — Database performing an automated protein sequence annotation
- InterPro — Integration of protein domain and protein family databases
References
- Finn RD, Mistry J, Schuster-Bockler B, Griffiths-Jones S, Hollich V, Lassmann T, Moxon S, Marshall M, Khanna A, Durbin R, Eddy SR, Sonnhammer EL, Bateman A (2006). "Pfam: clans, web tools and services". Nucleic Acids Res, 34:D247-D251; PMID 16381856.
- Finn RD, Marshall M, Bateman A (2005). "iPfam: visualization of protein-protein interactions in PDB at domain and amino acid resolutions". Bioinformatics, 21:410-412; PMID 15353450.
- Bateman A, Coin L, Durbin R, Finn RD, Hollich V, Griffiths-Jones S, Khanna A, Marshall M, Moxon S, Sonnhammer EL, Studholme DJ, Yeats C, Eddy SR (2004). "The Pfam protein families database". Nucleic Acids Res, 32(Database issue):D138-D141; PMID 14681378.
External links
- Pfam - Protein family database at Sanger Institute, UK
- Pfam - Protein family database at Janelia Farm Research Campus, USA
- Pfam - Protein family database at Center for Genomics and Bioinformatics, Sweden
- iPfam - Interactions of Pfam domains in PDB
- Pfam database files (ftp)
- Pfam MySQL database documentation