#!/usr/local/bin/perl ############################################################################### # Program : GetDomainHits # Author : Eric Deutsch # $Id$ # # Description : This program that allows users to # browse through data in the Annotated Peptide Database. # # SBEAMS is Copyright (C) 2000-2005 Institute for Systems Biology # This program is governed by the terms of the GNU General Public License (GPL) # version 2 as published by the Free Software Foundation. It is provided # WITHOUT ANY WARRANTY. See the full description of GPL terms in the # LICENSE file distributed with this software. # ############################################################################### ############################################################################### # Set up all needed modules and objects ############################################################################### use strict; use Getopt::Long; use FindBin; use lib "$FindBin::Bin/../../lib/perl"; use vars qw ($sbeams $sbeamsMOD $q $current_contact_id $current_username $PROG_NAME $USAGE %OPTIONS $QUIET $VERBOSE $DEBUG $DATABASE $TABLE_NAME $PROGRAM_FILE_NAME $CATEGORY $DB_TABLE_NAME @MENU_OPTIONS); use SBEAMS::Connection qw($q); use SBEAMS::Connection::Settings; use SBEAMS::Connection::Tables; use SBEAMS::ProteinStructure; use SBEAMS::ProteinStructure::Settings; use SBEAMS::ProteinStructure::Tables; $sbeams = new SBEAMS::Connection; $sbeamsMOD = new SBEAMS::ProteinStructure; $sbeamsMOD->setSBEAMS($sbeams); $sbeams->setSBEAMS_SUBDIR($SBEAMS_SUBDIR); #use CGI; use CGI::Carp qw(fatalsToBrowser croak); #$q = new CGI; ############################################################################### # Set program name and usage banner for command like use ############################################################################### $PROG_NAME = $FindBin::Script; $USAGE = <Authenticate() and exit if it fails or continue if it works. ############################################################################### sub main { #### Do the SBEAMS authentication and exit if a username is not returned exit unless ($current_username = $sbeams->Authenticate( permitted_work_groups_ref=>['ProteinStructure_user', 'ProteinStructure_admin','ProteinStructure_readonly','Admin'], #connect_read_only=>1, #allow_anonymous_access=>1, )); #### Read in the default input parameters my %parameters; my $n_params_found = $sbeams->parse_input_parameters( q=>$q,parameters_ref=>\%parameters); #$sbeams->printDebuggingInfo($q); #### Process generic "state" parameters before we start $sbeams->processStandardParameters(parameters_ref=>\%parameters); #### Decide what action to take based on information so far if ($parameters{action} eq "???") { # Some action } else { $sbeamsMOD->display_page_header( onload => 'hide_processing_info()' ); print_hide_script() if $sbeams->output_mode() eq 'html'; handle_request(ref_parameters=>\%parameters); $sbeamsMOD->display_page_footer(); } } # end main sub print_hide_script { # Some page-specific javascript/css. Allows loading info to get hidden, # draws box around legend print <<" END"; END } ############################################################################### # Handle Request ############################################################################### sub handle_request { my %args = @_; #### Process the arguments list my $ref_parameters = $args{'ref_parameters'} || die "ref_parameters not passed"; my %parameters = %{$ref_parameters}; #### Define some generic varibles my ($i,$element,$key,$value,$line,$result,$sql); #### Define some variables for a query and resultset my %resultset = (); my $resultset_ref = \%resultset; my (%url_cols,%hidden_cols,%max_widths,$show_sql); #### Read in the standard form values my $apply_action = $parameters{'action'} || $parameters{'apply_action'}; my $TABLE_NAME = $parameters{'QUERY_NAME'}; #### Set some specific settings for this program my $CATEGORY="Get Domain Hits"; $TABLE_NAME="PS_GetDomainHit" unless ($TABLE_NAME); ($PROGRAM_FILE_NAME) = $sbeamsMOD->returnTableInfo($TABLE_NAME,"PROGRAM_FILE_NAME"); my $base_url = "$CGI_BASE_DIR/$SBEAMS_SUBDIR/$PROGRAM_FILE_NAME"; #### Get the columns and input types for this table/query my @columns = $sbeamsMOD->returnTableInfo($TABLE_NAME,"ordered_columns"); my %input_types = $sbeamsMOD->returnTableInfo($TABLE_NAME,"input_types"); #### Read the input parameters for each column my $n_params_found = $sbeams->parse_input_parameters( q=>$q,parameters_ref=>\%parameters, columns_ref=>\@columns,input_types_ref=>\%input_types); #### If the apply action was to recall a previous resultset, do it my %rs_params = $sbeams->parseResultSetParams(q=>$q); if ($apply_action eq "VIEWRESULTSET") { $sbeams->readResultSet(resultset_file=>$rs_params{set_name}, resultset_ref=>$resultset_ref,query_parameters_ref=>\%parameters); $n_params_found = 99; } #### Set some reasonable defaults if no parameters supplied unless ($n_params_found) { $parameters{input_form_format} = "minimum_detail"; $parameters{display_options} = "ApplyChilliFilter,ShowExtraProteinProps"; } #### Apply any parameter adjustment logic unless (defined($parameters{project_id}) && $parameters{project_id} gt '') { $parameters{project_id} = $sbeams->getCurrent_project_id(); } #### Display the user-interaction input form $sbeams->display_input_form( TABLE_NAME=>$TABLE_NAME,CATEGORY=>$CATEGORY,apply_action=>$apply_action, PROGRAM_FILE_NAME=>$PROGRAM_FILE_NAME, parameters_ref=>\%parameters, input_types_ref=>\%input_types, ); #### Display the form action buttons $sbeams->display_form_buttons(TABLE_NAME=>$TABLE_NAME); #### Finish the upper part of the page and go begin the full-width #### data portion of the page # $sbeams->display_page_footer(close_tables=>'YES', # separator_bar=>'YES',display_footer=>'NO'); ######################################################################### #### Process all the constraints #### Build BIOSEQUENCE_SET constraint my $form_test = $sbeams->parseConstraint2SQL( constraint_column=>"BS.biosequence_set_id", constraint_type=>"int_list", constraint_name=>"BioSequence Set", constraint_value=>$parameters{biosequence_set_id} ); return if ($form_test eq '-1'); #### Verify that the selected biosequence_sets are permitted if ($parameters{biosequence_set_id}) { my $sql = qq~ SELECT biosequence_set_id,project_id FROM $TBPS_BIOSEQUENCE_SET WHERE biosequence_set_id IN ( $parameters{biosequence_set_id} ) AND record_status != 'D' ~; my %project_ids = $sbeams->selectTwoColumnHash($sql); my @accessible_project_ids = $sbeams->getAccessibleProjects(); my %accessible_project_ids; foreach my $id ( @accessible_project_ids ) { $accessible_project_ids{$id} = 1; } my @input_ids = split(',',$parameters{biosequence_set_id}); my @verified_ids; foreach my $id ( @input_ids ) { #### If the requested biosequence_set_id doesn't exist if (! defined($project_ids{$id})) { $sbeams->reportException( state => 'ERROR', type => 'BAD CONSTRAINT', message => "Non-existent biosequence_set_id = $id specified", ); #### If the project for this biosequence_set is not accessible } elsif (! defined($accessible_project_ids{$project_ids{$id}})) { $sbeams->reportException( state => 'ERROR', type => 'PERMISSION DENIED', message => "Your current privilege settings do not allow you to access biosequence_set_id = $id. See project owner to gain permission.", ); #### Else, let it through } else { push(@verified_ids,$id); } } #### Set the input constraint to only allow that which is valid $parameters{biosequence_set_id} = join(',',@verified_ids); } #### If no valid biosequence_set_id was selected, stop here unless ($parameters{biosequence_set_id}) { $sbeams->reportException( state => 'ERROR', type => 'INSUFFICIENT CONSTRAINTS', message => "You must select at least one valid Biosequence Set", ); return; } #### Build BIOSEQUENCE_SET constraint my $biosequence_set_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BS.biosequence_set_id", constraint_type=>"int_list", constraint_name=>"BioSequence Set", constraint_value=>$parameters{biosequence_set_id} ); return if ($biosequence_set_clause eq '-1'); #### Build BIOSEQUENCE_NAME constraint my $biosequence_name_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BS.biosequence_name", constraint_type=>"plain_text", constraint_name=>"BioSequence Name", constraint_value=>$parameters{biosequence_name_constraint} ); return if ($biosequence_name_clause eq '-1'); #### Build BIOSEQUENCE_ACCESSION constraint my $biosequence_accession_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BS.biosequence_accession", constraint_type=>"plain_text", constraint_name=>"BioSequence Accession", constraint_value=>$parameters{biosequence_accession_constraint} ); return if ($biosequence_accession_clause eq '-1'); #### Build GENE SYMBOL constraint my $gene_symbol_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BSA.gene_symbol", constraint_type=>"plain_text", constraint_name=>"Gene Symbol", constraint_value=>$parameters{gene_symbol_constraint} ); return if ($gene_symbol_clause eq '-1'); #### Build FULL GENE NAME constraint my $full_gene_name_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BSA.full_gene_name", constraint_type=>"plain_text", constraint_name=>"full Gene Name", constraint_value=>$parameters{full_gene_name_constraint} ); return if ($full_gene_name_clause eq '-1'); #### Build EC Numbers constraint if (defined($parameters{EC_number_constraint}) && $parameters{EC_number_constraint} && $parameters{EC_number_constraint} !~ /^(\!)*%.+%$/) { $parameters{EC_number_constraint} = '%'.$parameters{EC_number_constraint}.'%'; } my $EC_numbers_clause = $sbeams->parseConstraint2SQL( constraint_column=>"D.EC_numbers", constraint_type=>"plain_text", constraint_name=>"EC Number", constraint_value=>$parameters{EC_number_constraint} ); return if ($EC_numbers_clause eq '-1'); if ($EC_numbers_clause =~ /(\'[%\d\.\-]+\')/) { my $str = $1; $EC_numbers_clause =~ s/$str/$str OR BSA.EC_numbers LIKE $str/; } #### Build BIOSEQUENCE_DESC constraint my $biosequence_desc_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BS.biosequence_desc", constraint_type=>"plain_text", constraint_name=>"BioSequence Description", constraint_value=>$parameters{biosequence_desc_constraint} ); return if ($biosequence_desc_clause eq '-1'); #### Build BIOSEQUENCE_SEQ constraint my $biosequence_seq_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BS.biosequence_seq", constraint_type=>"plain_text", constraint_name=>"BioSequence Sequence", constraint_value=>$parameters{biosequence_seq_constraint} ); return if ($biosequence_seq_clause eq '-1'); $biosequence_seq_clause =~ s/\*/\%/g; #### Build MOLECULAR FUNCTION constraint my $molecular_function_clause = $sbeams->parseConstraint2SQL( constraint_column=>"MFA.annotation", constraint_type=>"plain_text", constraint_name=>"Molecular Function", constraint_value=>$parameters{molecular_function_constraint} ); return if ($molecular_function_clause eq '-1'); #### Build BIOLOGICAL PROCESS constraint my $biological_process_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BPA.annotation", constraint_type=>"plain_text", constraint_name=>"Biological Process", constraint_value=>$parameters{biological_process_constraint} ); return if ($biological_process_clause eq '-1'); #### Build CELLULAR COMPONENT constraint my $cellular_component_clause = $sbeams->parseConstraint2SQL( constraint_column=>"CCA.annotation", constraint_type=>"plain_text", constraint_name=>"Cellular Component", constraint_value=>$parameters{cellular_component_constraint} ); return if ($cellular_component_clause eq '-1'); #### Build INTERPRO PROTEIN DOMAIN constraint my $protein_domain_clause = $sbeams->parseConstraint2SQL( constraint_column=>"IPDA.annotation", constraint_type=>"plain_text", constraint_name=>"InterPro Protein Domain", constraint_value=>$parameters{protein_domain_constraint} ); return if ($protein_domain_clause eq '-1'); #### Build PROTEIN LENGTH constraint my $protein_length_clause = $sbeams->parseConstraint2SQL( constraint_column=>"DATALENGTH(BS.biosequence_seq)", constraint_type=>"flexible_int", constraint_name=>"Protein Length", constraint_value=>$parameters{protein_length_constraint} ); return if ($protein_length_clause eq '-1'); #### Build BIOSEQUENCE CATEGORY constraint my $biosequence_category_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BPS.category", constraint_type=>"text_list", constraint_name=>"Biosequence Category", constraint_value=>$parameters{biosequence_category_constraint} ); return if ($biosequence_category_clause eq '-1'); #### Build DOMAIN MATCH SOURCE constraint my $domain_match_source_clause = $sbeams->parseConstraint2SQL( constraint_column=>"DM.domain_match_source_id", constraint_type=>"int_list", constraint_name=>"Domain Match Source", constraint_value=>$parameters{domain_match_source_constraint} ); return if ($domain_match_source_clause eq '-1'); #### Build DOMAIN MATCH TYPE constraint my $domain_match_type_clause = $sbeams->parseConstraint2SQL( constraint_column=>"DM.domain_match_type_id", constraint_type=>"int_list", constraint_name=>"Domain Match Type", constraint_value=>$parameters{domain_match_type_constraint} ); return if ($domain_match_type_clause eq '-1'); #### Build DOMAIN MATCH NAME constraint my $domain_match_name_clause = $sbeams->parseConstraint2SQL( constraint_column=>"DM.match_name", constraint_type=>"plain_text", constraint_name=>"Domain Match Name", constraint_value=>$parameters{domain_match_name_constraint} ); return if ($domain_match_name_clause eq '-1'); #### Build MATCH ANNOTATION constraint my $match_annotation_clause = $sbeams->parseConstraint2SQL( constraint_column=>"DM.match_annotation", constraint_type=>"plain_text", constraint_name=>"Match Annotation", constraint_value=>$parameters{match_annotation_constraint} ); return if ($match_annotation_clause eq '-1'); #### Build E VALUE constraint my $e_value_clause = $sbeams->parseConstraint2SQL( constraint_column=>"DM.e_value", constraint_type=>"flexible_float", constraint_name=>"Expectation Value", constraint_value=>$parameters{e_value_constraint} ); return if ($e_value_clause eq '-1'); #### Build TRANSMEMBRANE CLASS constraint my $transmembrane_class_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BPS.transmembrane_class", constraint_type=>"text_list", constraint_name=>"Transmembrane Class", constraint_value=>$parameters{transmembrane_class_constraint} ); return if ($transmembrane_class_clause eq '-1'); #### Build NUMBER OF TRANSMEMBRANE REGIONS constraint my $n_transmembrane_regions_clause = $sbeams->parseConstraint2SQL( constraint_column=>"BPS.n_transmembrane_regions", constraint_type=>"flexible_int", constraint_name=>"Number of Transmembrane regions", constraint_value=>$parameters{n_transmembrane_regions_constraint} ); return if ($n_transmembrane_regions_clause eq '-1'); #### Build ROWCOUNT constraint $parameters{row_limit} = 30000 unless ($parameters{row_limit} > 0 && $parameters{row_limit}<=1000000); #### Since the number of rows get changes in postProcess, hack in some #### artificial inflation for later deflation my $server_row_limit = $parameters{row_limit}; my $final_row_limit = $parameters{row_limit}; if ($parameters{display_options} =~ /ApplyChilliFilter/i || $parameters{display_options} =~ /SeqWidth/i ) { $server_row_limit = 100000; } my $limit_clause = $sbeams->buildLimitClause( row_limit=>$server_row_limit); $limit_clause->{final_row_limit} = $final_row_limit; #### Define some variables needed to build the query my @column_array; #### If the user opted to see the GO columns, add them in my @additional_columns = (); if ( $parameters{display_options} =~ /ShowGOColumns/ || $molecular_function_clause.$biological_process_clause. $cellular_component_clause.$protein_domain_clause ) { @additional_columns = ( ["molecular_function","MFA.annotation","Molecular Function"], ["molecular_function_GO","MFA.external_accession","molecular_function_GO"], ["biological_process","BPA.annotation","Biological Process"], ["biological_process_GO","BPA.external_accession","biological_process_GO"], ["cellular_component","CCA.annotation","Cellular Component"], ["cellular_component_GO","CCA.external_accession","cellular_component_GO"], ["interpro_protein_domain","IPDA.annotation","InterPro Protein Domain"], ["interpro_protein_domain_GO","IPDA.external_accession","interpro_protein_domain_GO"], ); } #### If the user opted to see GO columns or provided some GO constraints, #### then join in the GO tables my $GO_join = ""; if ( $parameters{display_options} =~ /ShowGOColumns/ || $molecular_function_clause.$biological_process_clause. $cellular_component_clause.$protein_domain_clause ) { $GO_join = qq~ LEFT JOIN BioLink.dbo.biosequence_annotated_gene AG ON ( BS.biosequence_id = AG.biosequence_id ) LEFT JOIN BioLink.dbo.gene_annotation MFA ON ( AG.annotated_gene_id = MFA.annotated_gene_id AND MFA.gene_annotation_type_id = 1 AND MFA.idx = 0 ) LEFT JOIN BioLink.dbo.gene_annotation BPA ON ( AG.annotated_gene_id = BPA.annotated_gene_id AND BPA.gene_annotation_type_id = 2 AND BPA.idx = 0 ) LEFT JOIN BioLink.dbo.gene_annotation CCA ON ( AG.annotated_gene_id = CCA.annotated_gene_id AND CCA.gene_annotation_type_id = 3 AND CCA.idx = 0 ) LEFT JOIN BioLink.dbo.gene_annotation IPDA ON ( AG.annotated_gene_id = IPDA.annotated_gene_id AND IPDA.gene_annotation_type_id = 4 AND IPDA.idx = 0 ) ~; } #### Add in some extra columns if the user wants to see them my @tmhmm_columns = (); if ( $parameters{display_options} =~ /ShowExtraProteinProps/ ) { @tmhmm_columns = ( #["fav_codon_frequency","STR(BPS.fav_codon_frequency,10,3)","Favored Codon Frequency"], ["transmembrane_class","BPS.transmembrane_class","TMR Class"], ["n_transmembrane_regions","BPS.n_transmembrane_regions","Number of TMRs"], ); } #### Define the desired columns in the query #### [friendly name used in url_cols,SQL,displayed column title] @column_array = ( ["biosequence_set_name","BSS.set_name","Biosequence Set Name"], ["biosequence_name","BS.biosequence_name","Biosequence Name"], ["biosequence_accessor","DBX.accessor","biosequence_accessor"], ["biosequence_accessor_suffix","DBX.accessor_suffix","biosequence_accessor_suffix"], ["biosequence_accession","BS.biosequence_accession","Biosequence Accession"], ["gene_symbol","BSA.gene_symbol","Gene Symbol"], ["full_gene_name","BSA.full_gene_name","Full Gene Name"], ["protein_EC_numbers","BSA.EC_numbers","Protein EC Numbers"], @tmhmm_columns, ["isoelectric_point","STR(BPS.isoelectric_point,7,3)","pI"], ["match_source_name","DMS.domain_match_source_name","Match Source"], ["domain_match_index","DM.domain_match_index","Domain Index"], ["overall_probability","STR(DM.overall_probability,7,3)","Over- all P"], ["best_match_flag","DM.best_match_flag","BH"], ["cluster_name","DM.cluster_name","Cluster"], ["query_start","DM.query_start","Query Start"], ["query_end","DM.query_end","Query End"], ["query_length","DM.query_length","Query Length"], ["match_start","DM.match_start","Match Start"], ["match_end","DM.match_end","Match End"], ["match_length","DM.match_length","Match Length"], ["match_type_name","DMT.domain_match_type_name","Match Type"], ["match_name","DM.match_name","Match Name"], ["match_accession","DM.match_accession","match_accession"], ["match_EC_numbers","D.EC_numbers","Domain EC Numbers"], ["probability","STR(DM.probability,7,3)","Prob"], ["e_value","CONVERT(varchar(50),DM.e_value)","E value"], ["score","DM.score","Score"], ["z_score","DM.z_score","Z Score"], @additional_columns, ["second_match_name","DM.second_match_name","Second Reference"], ["second_match_accession","DM.second_match_accession","second_match_accession"], ["match_annotation","DM.match_annotation","Match Annotation"], ["chromosome","BPS.chromosome","Chromosome"], ["start_in_chromosome","BPS.start_in_chromosome","Start"], ["end_in_chromosome","BPS.end_in_chromosome","End"], ["biosequence_desc","BS.biosequence_desc","Biosequence Description"], ["project_id","BSS.project_id","project_id"], ["biosequence_set_id","BSS.biosequence_set_id","biosequence_set_id"], ["biosequence_id","BS.biosequence_id","biosequence_id"], ["biosequence_annotation_id","BSA.biosequence_annotation_id","biosequence_annotation_id"], ["match_accessor","TDBX.accessor","match_accessor"], ["match_accessor_suffix","TDBX.accessor_suffix","match_accessor_suffix"], ["second_match_accessor","TDBX2.accessor","second_match_accessor"], ["second_match_accessor_suffix","TDBX2.accessor_suffix","second_match_accessor_suffix"], ["organism_full_name","O.full_name","Organism"], ); #### Limit the width of the Reference column if user selected if ( $parameters{display_options} =~ /MaxRefWidth/ ) { $max_widths{'Reference'} = 20; } #### Set flag to display SQL statement if user selected if ( $parameters{display_options} =~ /ShowSQL/ ) { $show_sql = 1; } #### Build the columns part of the SQL statement my %colnameidx = (); my @column_titles = (); my $columns_clause = $sbeams->build_SQL_columns_list( column_array_ref=>\@column_array, colnameidx_ref=>\%colnameidx, column_titles_ref=>\@column_titles ); #### Define the SQL statement $sql = qq~ SELECT $limit_clause->{top_clause} $columns_clause FROM $TBPS_DOMAIN_MATCH DM LEFT JOIN $TBPS_DOMAIN_MATCH_SOURCE DMS ON ( DM.domain_match_source_id = DMS.domain_match_source_id ) LEFT JOIN $TBPS_DOMAIN_MATCH_TYPE DMT ON ( DM.domain_match_type_id = DMT.domain_match_type_id ) LEFT JOIN $TB_DBXREF TDBX ON ( DMT.dbxref_id = TDBX.dbxref_id ) LEFT JOIN $TBPS_DOMAIN_MATCH_TYPE DMT2 ON ( DM.second_match_type_id = DMT2.domain_match_type_id ) LEFT JOIN $TB_DBXREF TDBX2 ON ( DMT2.dbxref_id = TDBX2.dbxref_id ) LEFT JOIN $TBPS_DOMAIN D ON ( DM.match_name = D.domain_name ) LEFT JOIN $TBPS_BIOSEQUENCE_ANNOTATION BSA ON ( DM.biosequence_id = BSA.biosequence_id ) INNER JOIN $TBPS_BIOSEQUENCE BS ON ( DM.biosequence_id = BS.biosequence_id ) LEFT JOIN $TB_DBXREF DBX ON ( BS.dbxref_id = DBX.dbxref_id ) LEFT JOIN $TBPS_BIOSEQUENCE_SET BSS ON ( BS.biosequence_set_id = BSS.biosequence_set_id ) LEFT JOIN $TB_ORGANISM O ON ( BSS.organism_id = O.organism_id ) LEFT JOIN $TBPS_BIOSEQUENCE_PROPERTY_SET BPS ON ( BS.biosequence_id = BPS.biosequence_id ) $GO_join WHERE 1 = 1 $biosequence_set_clause $biosequence_name_clause $biosequence_accession_clause $biosequence_desc_clause $gene_symbol_clause $full_gene_name_clause $EC_numbers_clause $domain_match_source_clause $domain_match_type_clause $domain_match_name_clause $e_value_clause $match_annotation_clause $molecular_function_clause $biological_process_clause $cellular_component_clause $protein_domain_clause $biosequence_category_clause $transmembrane_class_clause $n_transmembrane_regions_clause ORDER BY BS.biosequence_name,DM.domain_match_index,DM.cluster_name,DM.query_start $limit_clause->{trailing_limit_clause} ~; #### If we want to show all data where there is a match, then do funky stuff if ($parameters{display_options} =~ /ShowAllMatches/) { my $sub_sql = $sql; my $new_sql = $sql; $sub_sql =~ s/SELECT.+FROM/SELECT DISTINCT DM.biosequence_id\n FROM/s; $sub_sql =~ s/ORDER BY .+$//s; $new_sql =~ s/WHERE 1 = 1.+ORDER BY/WHERE DM.biosequence_id IN \(\n\n$sub_sql\n\n\)\n ORDER BY/s; $sql = $new_sql; } #### Certain types of actions should be passed to links my $pass_action = "QUERY"; $pass_action = $apply_action if ($apply_action =~ /QUERY/i); #### Pass nearly all of the constraints down to a child query my @parameters_to_pass; my $parameters_list = ''; while ( ($key,$value) = each %input_types ) { if ($key ne 'sort_order' && $key ne 'display_options' && $key ne 'reference_constraint' && $key ne 'peptide_string_constraint' && $key ne 'charge_constraint' && $key ne 'xx_constraint' ) { if ($parameters{$key}) { push(@parameters_to_pass,"$key=$parameters{$key}"); } } } if (@parameters_to_pass) { $parameters_list = join('&',@parameters_to_pass); } #### Define the hypertext links for columns that need them %url_cols = ('Biosequence Accession' => "\%$colnameidx{biosequence_accessor}V\%$colnameidx{biosequence_accession}V", 'Biosequence Accession_ATAG' => 'TARGET="Win1" ', 'Biosequence Name' => "$CGI_BASE_DIR/ProteinStructure/GetDomainHits?QUERY_NAME=PS_GetDomainHit&biosequence_id=$parameters{biosequence_id}&biosequence_name_constraint=\%V&project_id=\%$colnameidx{project_id}V&biosequence_set_id=\%$colnameidx{biosequence_set_id}V&display_options=ShowExtraProteinProps&apply_action=$pass_action", 'Full Gene Name' => "$CGI_BASE_DIR/$SBEAMS_PART/ManageTable.cgi?TABLE_NAME=PS_biosequence_annotation&biosequence_annotation_id=\%$colnameidx{biosequence_annotation_id}V&biosequence_id=\%$colnameidx{biosequence_id}V&ShowEntryForm=1", 'Full Gene Name_ATAG' => 'TARGET="Win1"', 'Full Gene Name_ISNULL' => ' [Add] ', 'Gene Symbol' => "$CGI_BASE_DIR/$SBEAMS_PART/ManageTable.cgi?TABLE_NAME=PS_biosequence_annotation&biosequence_annotation_id=\%$colnameidx{biosequence_annotation_id}V&biosequence_id=\%$colnameidx{biosequence_id}V&ShowEntryForm=1", 'Gene Symbol_ATAG' => 'TARGET="Win1"', 'Match Name' => "\%$colnameidx{match_accessor}V\%$colnameidx{match_accession}V\%$colnameidx{match_accessor_suffix}V", 'Match Name_ATAG' => 'TARGET="WinExt" ', 'Protein EC Numbers' => "http://ca.expasy.org/cgi-bin/nicezyme.pl?\%V", 'Protein EC Numbers_ATAG' => 'TARGET="WinExt"', 'Protein EC Numbers_OPTIONS' => {semicolon_separated_list=>1}, 'Domain EC Numbers' => "http://ca.expasy.org/cgi-bin/nicezyme.pl?\%V", 'Domain EC Numbers_ATAG' => 'TARGET="WinExt"', 'Domain EC Numbers_OPTIONS' => {semicolon_separated_list=>1}, 'Query Start' => "$CGI_BASE_DIR/ProteinStructure/BrowseBioSequence.cgi?biosequence_name_constraint=\%$colnameidx{biosequence_name}V&biosequence_set_id=\%$colnameidx{biosequence_set_id}V&label_start=\%$colnameidx{query_start}V&label_end=\%$colnameidx{query_end}V&apply_action=HIDEQUERY&display_options=SequenceFormat,ShowExtraProteinProps&project_id=\%$colnameidx{project_id}V", 'Query Start_ATAG' => 'TARGET="Win1" ', 'Second Reference' => "\%$colnameidx{second_match_accessor}V\%$colnameidx{second_match_accession}V\%$colnameidx{second_match_accessor_suffix}V", 'Second Reference_ATAG' => 'TARGET="WinExt" ', ); #### Define columns that should be hidden in the output table %hidden_cols = ( 'biosequence_accessor' => 1, 'biosequence_accessor_suffix' => 1, 'match_accessor' => 1, 'match_accession' => 1, 'match_accessor_suffix' => 1, 'second_match_accessor' => 1, 'second_match_accession' => 1, 'second_match_accessor_suffix' => 1, 'molecular_function_GO' => 1, 'molecular_function_GO' => 1, 'biological_process_GO' => 1, 'cellular_component_GO' => 1, 'interpro_protein_domain_GO' => 1, 'project_id' => 1, 'biosequence_set_id' => 1, 'biosequence_id' => 1, 'biosequence_annotation_id' => 1, 'Organism' => 1, ); ######################################################################### #### If QUERY or VIEWRESULTSET was selected, display the data if ($apply_action =~ /QUERY/i || $apply_action eq "VIEWRESULTSET") { #### If the action contained QUERY, then fetch the results from #### the database if ($apply_action =~ /QUERY/i) { #### Show the SQL that will be or was executed $sbeams->display_sql(sql=>$sql) if ($show_sql); #### Fetch the results from the database server $sbeams->fetchResultSet( sql_query=>$sql, resultset_ref=>$resultset_ref, ); #### Post process the resultset if ($parameters{display_options} =~ /ApplyChilliFilter/i || $parameters{display_options} =~ /SeqWidth/i ) { postProcessResultset( rs_params_ref=>\%rs_params, resultset_ref=>$resultset_ref, query_parameters_ref=>\%parameters, column_titles_ref=>\@column_titles, final_row_limit => $limit_clause->{final_row_limit}, ); } #### Store the resultset and parameters to disk resultset cache $rs_params{set_name} = "SETME"; $sbeams->writeResultSet( resultset_file_ref=>\$rs_params{set_name}, resultset_ref=>$resultset_ref, query_parameters_ref=>\%parameters, resultset_params_ref=>\%rs_params, query_name=>"$SBEAMS_SUBDIR/$PROGRAM_FILE_NAME", column_titles_ref=>\@column_titles, ); } #### Provide column definition link print qq~ Additional Information: [Column Definitions]

~ if ($sbeams->output_mode() eq 'html'); #### Display the resultset $sbeams->displayResultSet( resultset_ref=>$resultset_ref, query_parameters_ref=>\%parameters, rs_params_ref=>\%rs_params, url_cols_ref=>\%url_cols, hidden_cols_ref=>\%hidden_cols, max_widths=>\%max_widths, column_titles_ref=>\@column_titles, base_url=>$base_url, ); #### Display the resultset controls $sbeams->displayResultSetControls( resultset_ref=>$resultset_ref, query_parameters_ref=>\%parameters, rs_params_ref=>\%rs_params, base_url=>$base_url, ); #### Display a plot of data from the resultset $sbeams->displayResultSetPlot( rs_params_ref=>\%rs_params, resultset_ref=>$resultset_ref, query_parameters_ref=>\%parameters, column_titles_ref=>\@column_titles, base_url=>$base_url, ); #### If QUERY was not selected, then tell the user to enter some parameters } else { if ($sbeams->invocation_mode() eq 'http') { print "

Select parameters above and press QUERY

\n"; } else { print "You need to supply some parameters to contrain the query\n"; } } } # end handle_request ############################################################################### # evalSQL # # Callback for translating Perl variables into their values, # especially the global table variables to table names ############################################################################### sub evalSQL { my $sql = shift; return eval "\"$sql\""; } # end evalSQL ############################################################################### # postProcessResultset # # Perform some additional processing on the resultset that would otherwise # be very awkward to do in SQL. ############################################################################### sub postProcessResultset { my %args = @_; my ($i,$element,$key,$value,$line,$result,$sql); #### Process the arguments list my $resultset_ref = $args{'resultset_ref'}; my $rs_params_ref = $args{'rs_params_ref'}; my $query_parameters_ref = $args{'query_parameters_ref'}; my $column_titles_ref = $args{'column_titles_ref'}; my $final_row_limit = $args{'final_row_limit'}; my %rs_params = %{$rs_params_ref}; my %parameters = %{$query_parameters_ref}; my $n_rows = scalar(@{$resultset_ref->{data_ref}}); my @new_data_array; #### Define a structure for some stats my %stats; $stats{single_domain_proteins} = []; $stats{multiple_domain_proteins} = []; $stats{n_proteins} = 0; my %surviving_proteins; # Proteins that survive the ChilliFilter #### Determine some column indices in the resultset my $column_indices = $resultset_ref->{column_hash_ref}; my $biosequence_name_column_index = $column_indices->{biosequence_name}; my $match_source_column_index = $column_indices->{match_source_name}; my $match_type_column_index = $column_indices->{match_type_name}; my $domain_match_index_column_index = $column_indices->{domain_match_index}; my $query_start_column_index = $column_indices->{query_start}; my $query_end_column_index = $column_indices->{query_end}; my $e_value_column_index = $column_indices->{e_value}; my $probability_column_index = $column_indices->{probability}; my $overall_probability_column_index = $column_indices->{overall_probability}; my $chromosome_column_index = $column_indices->{chromosome}; #### We will process in groups of rows where a group is one biosequence my $startrow = 0; my $endrow = 0; #### Loop over (groups of) rows in the resultset my @rows = @{$resultset_ref->{data_ref}}; #while ( $startrow < $n_rows-1) { Deutsch changed 2004-03-10 !!! while ( $startrow < $n_rows) { #### Get this biosequence_name my $biosequence_name = $rows[$startrow]->[$biosequence_name_column_index]; $stats{n_proteins}++; #### Set $endrow to the last row for this biosequence $endrow = $startrow; $endrow++ while ($endrow < $n_rows && $rows[$endrow]->[$biosequence_name_column_index] eq $biosequence_name); $endrow--; #### Now examine the details of this group of rows and decide what to do #### First count the number of domains present my %domain_indices = (); for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$domain_match_index_column_index] gt '') { $domain_indices{$rows[$i]->[$domain_match_index_column_index]} = 1; } } ## Will this work, or does keys return a list instead of array? my @domain_list = keys(%domain_indices); #print "Domains: ".join(",",@domain_list)."
\n"; my $n_domains = scalar(@domain_list); my $pdbblast_domains = 0; my $pfam_domains = 0; my $pfam_best_domains = 0; my $rosetta_success_domains = 0; my $rosetta_best_domains = 0; my $NgAnnots = 0; my $TIGRFAM_hits = 0; my $COGs_hits = 0; #### Look for imported annotations and show those for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$match_type_column_index] eq 'NgAnnot') { push(@new_data_array,$rows[$i]); $NgAnnots++; } } #### Look for TIGRFAM hits and show those for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$match_type_column_index] eq 'TIGRFAM') { push(@new_data_array,$rows[$i]); $TIGRFAM_hits++; } } #### Look for COGs hits and show those for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$match_type_column_index] eq 'COGs') { push(@new_data_array,$rows[$i]); $COGs_hits++; } } #### Look for pdbblast hits and store carefully my %pdbblast_domains = (); for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$match_type_column_index] eq 'pdbblast' || $rows[$i]->[$match_type_column_index] eq 'orfeus') { my $domain_match_index = $rows[$i]->[$domain_match_index_column_index]; my %attrs = ( query_start => $rows[$i]->[$query_start_column_index], query_end => $rows[$i]->[$query_end_column_index], row_number => $i, ); #### If we already have a result for this domain, squawk else store if (exists($pdbblast_domains{$domain_match_index})) { print "WARNING: duplicate pdbblast hits for $biosequence_name". "($domain_match_index)
\n"; } else { push(@new_data_array,$rows[$i]); $pdbblast_domains{$domain_match_index} = \%attrs; $pdbblast_domains++; } } } #### Look for pfam hits and store carefully my %pfam_domains = (); my $unknown_index_counter = 10; for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$match_type_column_index] eq 'pfam' && $rows[$i]->[$e_value_column_index] <= .1) { #### Get the domain_match_index my $domain_match_index = $rows[$i]->[$domain_match_index_column_index]; #### If it's empty, then, assign it a temporary one unless ($domain_match_index) { $domain_match_index = "$unknown_index_counter?"; $unknown_index_counter++; } my %attrs = ( query_start => $rows[$i]->[$query_start_column_index], query_end => $rows[$i]->[$query_end_column_index], row_number => $i, ); #### If we already have a pfam result for this domain, squawk if (exists($pfam_domains{$domain_match_index})) { print "WARNING: duplicate pfam hits for $biosequence_name ". "($domain_match_index)
\n"; #### If we already have a pdbblast result for this domain, drop this } elsif (exists($pdbblast_domains{$domain_match_index})) { print "INFO: We have pfam hit where pdbblast already exists ". "for $biosequence_name($domain_match_index)
\n" if ($VERBOSE); $attrs{new_row} = scalar(@new_data_array); push(@new_data_array,$rows[$i]); $pfam_domains{$domain_match_index} = \%attrs; $pfam_domains++; #### Else we probably have a new one, but a couple more checks apply } else { #### If this is a PFAM Search entry with no index, then see if #### there is a pdbblast entry in the same region of sequence my $found_a_match = 0; if ($domain_match_index =~ /\?/) { foreach my $index ( keys(%pdbblast_domains) ) { #### See if they overlap at all if ( ( $rows[$i]->[$query_start_column_index] >= $pdbblast_domains{$index}->{query_start} && $rows[$i]->[$query_start_column_index] < $pdbblast_domains{$index}->{query_end} ) || ( $rows[$i]->[$query_end_column_index] <= $pdbblast_domains{$index}->{query_end} && $rows[$i]->[$query_end_column_index] > $pdbblast_domains{$index}->{query_start} ) && $found_a_match == 0 ) { print "INFO: We have a pfam hit where pdbblast already exists ". "for $biosequence_name($domain_match_index)
\n" if ($VERBOSE); $attrs{new_row} = scalar(@new_data_array); push(@new_data_array,$rows[$i]); $pfam_domains{$domain_match_index} = \%attrs; $pfam_domains++; $found_a_match = 1; } } if ($found_a_match == 0) { $attrs{new_row} = scalar(@new_data_array); push(@new_data_array,$rows[$i]); $pfam_domains{$domain_match_index} = \%attrs; $pfam_domains++; $pfam_best_domains++; } #### Otherwise, this must be a Ginzu pfam hit. Make sure there isn't #### already a PFAM Search pfam hit. If so, replace that with this. } else { my $found_a_match = 0; foreach my $index ( keys(%pfam_domains) ) { #### See if they overlap at all if ( ( $rows[$i]->[$query_start_column_index] >= $pfam_domains{$index}->{query_start} && $rows[$i]->[$query_start_column_index] < $pfam_domains{$index}->{query_end} ) || ( $rows[$i]->[$query_end_column_index] <= $pfam_domains{$index}->{query_end} && $rows[$i]->[$query_end_column_index] > $pfam_domains{$index}->{query_start} ) && $found_a_match == 0 ) { print "INFO: Overlapping Ginzu pfam and PFAM Search pfam hit". "for $biosequence_name($domain_match_index). Keeping only Ginzu result.
\n" if ($VERBOSE); #### Replace the previous row $new_data_array[$pfam_domains{$index}->{new_row}] = $rows[$i]; #### Delete the previous entry in the hash and add this one delete($pfam_domains{$index}); $pfam_domains{$domain_match_index} = \%attrs; $found_a_match = 1; } } if ($found_a_match == 0) { push(@new_data_array,$rows[$i]); $pfam_domains{$domain_match_index} = \%attrs; $pfam_domains++; $pfam_best_domains++; } } } } } #### Examine the Rosetta domains my %rosetta_domains = (); my @rosetta_buffer = (); my $current_domain = -999; my $max_probability = 0; my $overall_probability = 0; my $domain_match_index; for (my $i=$startrow; $i<=$endrow; $i++) { if ($rows[$i]->[$match_source_column_index] eq 'mamSum') { $domain_match_index = $rows[$i]->[$domain_match_index_column_index]; #print "Rosetta domain match index: ".$domain_match_index.", current: ".$current_domain." for $biosequence_name
\n"; #### If this is a new batch of results if ($domain_match_index != $current_domain) { #### But not the beginning of the first batch if ($current_domain != -999) { PROCESS_LAST_MAMSUM: #### If we already have a result for this domain if (exists($rosetta_domains{$current_domain})) { print "WARNING: duplicate rosetta domains for $biosequence_name". "($current_domain). This should never happen. Sort err?
\n" if ($VERBOSE); } elsif (exists($pdbblast_domains{$current_domain})) { ## This is fine, just ignore the Rosetta stuff print "INFO: ignoring Rosetta hit where pdbblast already exists ". "for $biosequence_name($current_domain)
\n" if ($VERBOSE); } elsif (exists($pfam_domains{$current_domain})) { ## This is fine, just ignore the Rosetta stuff print "INFO: ignoring Rosetta hit where pfam already exists ". "for $biosequence_name($current_domain)
\n" if ($VERBOSE); } elsif ($max_probability >= 0.3 || $overall_probability >=0.35) { push(@new_data_array,@rosetta_buffer); $rosetta_domains{$current_domain} = 1; $rosetta_best_domains++; #print "Rosetta best: $biosequence_name($current_domain)
\n"; } else { ## Rosetta tried and failed. No match anywhere. Unknown domain... } #### If Rosetta succeeded, make a note regardless of other if ($max_probability >= 0.3 || $overall_probability >=0.35) { $rosetta_success_domains++; #print "Rosetta success: $biosequence_name($current_domain)
\n"; } } $current_domain = $domain_match_index; $max_probability = 0; $overall_probability = 0; @rosetta_buffer = (); } #### Add this row to the buffer unless we're in the hop-back unless ($i > $endrow) { push(@rosetta_buffer,$rows[$i]); my $probability = $rows[$i]->[$probability_column_index]; $max_probability = $probability if ($probability > $max_probability); $overall_probability = $rows[$i]->[$overall_probability_column_index]; } } } #### Hop back into the loop to process the last mamSum group if (scalar(@rosetta_buffer)) { goto PROCESS_LAST_MAMSUM; } #### Done processing this biosequence. Store some statistics #push(@proteins,$biosequence_name); if ($n_domains > 1) { push(@{$stats{multiple_domain_proteins}},$biosequence_name); $stats{multiple_domain_proteins_domains} += $n_domains; } else { push(@{$stats{single_domain_proteins}},$biosequence_name); $n_domains = 1; $stats{single_domain_proteins_domains} += $n_domains; } $stats{pbdblast_domains} += $pdbblast_domains; $stats{pfam_domains} += $pfam_domains; $stats{pfam_best_domains} += $pfam_best_domains; $stats{rosetta_success_domains} += $rosetta_success_domains; $stats{rosetta_best_domains} += $rosetta_best_domains; $stats{TIGRFAM_hits} += $TIGRFAM_hits; $stats{COGs_hits} += $COGs_hits; $stats{NgAnnots} += $NgAnnots; if ($pdbblast_domains + $pfam_best_domains + $rosetta_best_domains + $TIGRFAM_hits + $COGs_hits + $NgAnnots > 0) { #### Store the chromosome stats if (defined($rows[$startrow]->[$chromosome_column_index])) { $stats{chromosome}->{$rows[$startrow]->[$chromosome_column_index]}++; } } if ($pdbblast_domains + $pfam_best_domains + $rosetta_best_domains + $TIGRFAM_hits + $COGs_hits > 0) { $stats{proteins_with_an_id}++; $surviving_proteins{$biosequence_name} = 1; if ($pdbblast_domains + $pfam_best_domains + $rosetta_best_domains == $n_domains) { $stats{proteins_with_all_id}++; } } #### Set new start and continue $startrow = $endrow + 1; #### Check the final number of rows if ($final_row_limit) { if (scalar(@new_data_array) >= $final_row_limit) { @new_data_array = @new_data_array[0..($final_row_limit-1)]; last; } } } #### Report the statistics my $buf = "\n"; $buf.="Prefilter statistics:\n"; $buf.="Total number of proteins: $stats{n_proteins} (with some domain information from this query)\n"; $buf.="Number of single-domain proteins: ".scalar(@{$stats{single_domain_proteins}})."\n"; $buf.="Number of single-domain protein domains: $stats{single_domain_proteins_domains}\n"; $buf.="Number of multiple-domain proteins: ".scalar(@{$stats{multiple_domain_proteins}})."\n"; $buf.="Number of multiple-domain protein domains: $stats{multiple_domain_proteins_domains}\n"; $buf.="Total number of domains: ".($stats{single_domain_proteins_domains}+$stats{multiple_domain_proteins_domains})."\n"; $buf.="\n"; $buf.="Filtering statistics:\n"; $buf.="Number of proteins with at least one matched domain: $stats{proteins_with_an_id} (includes PDB, PFAM, Ginzu, Rosetta, TIGRFAM, COGs, etc.)\n"; $buf.="Number of proteins with all domains matched: $stats{proteins_with_all_id} (includes only PDB, PFAM, Ginzu, Rosetta)\n"; $buf.="Number of good PDB match domains: $stats{pbdblast_domains}\n"; $buf.="Number of good PFAM match but not PDB match domains: $stats{pfam_best_domains}\n"; $buf.="Number of good Rosetta match domains without PDB or PFAM matches: $stats{rosetta_best_domains}\n"; $buf.="Total number of matched domains: ".($stats{pbdblast_domains}+$stats{pfam_best_domains}+$stats{rosetta_best_domains})." (includes only PDB, PFAM, Ginzu, Rosetta)\n"; $buf.="\n"; $buf.="Number of good PFAM matches regardless of PDB matches: $stats{pfam_domains} (sometimes more than one match per domain!)\n"; $buf.="Number of good Rosetta matches regardless of PDB or PFAM matches: $stats{rosetta_success_domains}\n"; $buf.="Number of good TIGRFAM matches: $stats{TIGRFAM_hits} (sometimes more than one match per protein!)\n"; $buf.="Number of good COGs matches: $stats{COGs_hits} (sometimes more than one match per protein!)\n"; $buf.="Number of NgAnnots: $stats{NgAnnots}\n"; $buf.="\n"; #### We we were able to keep per-chromosome stats if (defined($stats{chromosome})) { $buf.="Number of proteins by replicon: (proteins surviving the Chilli Filter after ny \"hit\" including NgAnnot)\n"; foreach my $chromosome (sort(keys(%{$stats{chromosome}}))) { $buf .= "$chromosome\t".$stats{chromosome}->{$chromosome}."\n"; } $buf .= "\n"; } #### If there are any surviving proteins if ( defined(%surviving_proteins)) { $buf .= "Proteins that make the Chilli Filter cut:" . " (can be pasted up in Biosequence Name Constraint)\n"; my $cnt = 0; for my $k ( sort(keys(%surviving_proteins)) ) { $buf .= ( $cnt ) ? ";$k" : $k; $cnt++; if ( $cnt > 12 ) { $buf .= "\n"; $cnt = 0; } } } $buf.="\n"; $buf =~ s/\n/
\n/g; my $toggle = $sbeams->make_toggle_section ( content => $buf, visible => 1, textlink => 1, sticky => 0, imglink => 1, hidetext => 'Hide', showtext => 'Show', neutraltext => 'processing info', name => '_GetDomainHits',); print "$toggle

" if ($sbeams->output_mode() eq 'interactive' || $sbeams->output_mode() eq 'html'); #### Replace the old rowset with the new, modified rowset $resultset_ref->{data_ref} = \@new_data_array; return 1; } # end postProcessResult