#!/usr/local/bin/perl
###############################################################################
# $Id$
#
# SBEAMS is Copyright (C) 2000-2021 Institute for Systems Biology
# This program is governed by the terms of the GNU General Public License (GPL)
# version 2 as published by the Free Software Foundation. It is provided
# WITHOUT ANY WARRANTY. See the full description of GPL terms in the
# LICENSE file distributed with this software.
###############################################################################
###############################################################################
# Get the script set up with everything it will need
###############################################################################
use strict;
use CGI::Carp qw(fatalsToBrowser croak);
use Data::Dumper;
use lib qw (../../lib/perl);
use vars qw ($q $sbeams $sbeamsMOD $PROG_NAME
$current_contact_id $current_username $glyco_query_o);
use SBEAMS::Connection qw($q $log);
use SBEAMS::Connection::Settings;
use SBEAMS::Connection::Tables;
use SBEAMS::Connection::DataTable;
use SBEAMS::BioLink::Tables;
use SBEAMS::BioLink::MSF;
use SBEAMS::PeptideAtlas;
use SBEAMS::PeptideAtlas::Settings;
use SBEAMS::PeptideAtlas::Tables;
use SBEAMS::Connection::TabMenu;
###############################################################################
# Global Variables
###############################################################################
#
$sbeams = new SBEAMS::Connection;
$sbeamsMOD = new SBEAMS::PeptideAtlas;
$sbeamsMOD->setSBEAMS($sbeams);
$sbeams->setSBEAMS_SUBDIR($SBEAMS_SUBDIR);
{ # Main
# Authenticate or exit
$current_username = $sbeams->Authenticate(allow_anonymous_access=>1) || exit;
#### Read in the default input parameters
my %params;
$sbeams->parse_input_parameters( q=>$q, parameters_ref=>\%params );
$sbeams->processStandardParameters(parameters_ref=>\%params);
## get project_id to send to HTMLPrinter display
my $project_id = $sbeams->getCurrent_project_id();
my $page = $sbeams->getGifSpacer( 800 ) . "
\n";
my $tabMenu = $sbeamsMOD->getTabMenu(
parameters_ref => \%params,
program_name => 'viewOrthologs',
);
$page .= "$tabMenu";
$sbeamsMOD->display_page_header(project_id => $project_id);
$log->debug( "Begin page:" .time() );
# Use default if requested
if ( $params{use_default} ) {
$params{group_id} = 'OG5_126932';
}
if ( $params{group_id} ) {
$page .= get_ortholog_display( %params );
} elsif ( $params{list_groups} ) {
$page .= get_ortholog_list( %params );
} else {
$page .= $sbeams->makeErrorText( "Missing required parameter group_id" );
}
$log->debug( "Page done:" .time() );
# Display page
print $page;
$sbeamsMOD->display_page_footer();
} # end main
sub get_ortholog_display {
my %args = @_;
# Content scalar to return
my $content = '';
# check params
for my $arg ( qw( group_id ) ) {
unless ( $args{$arg} ) {
$content .= $sbeams->makeErrorText("Missing required parameter: $arg
");
return $content;
}
}
# Highlight if available.
$args{entry_accession} ||= '';
# limit by bioseq_id?
my $bioseq_clause = '';
if( $args{restore} ) {
$log->info( "Restoring" );
$args{bioseq_id} = '';
} elsif( $args{bioseq_id} && !$args{search_groups} ) {
$bioseq_clause = "AND B.biosequence_id IN ( $args{bioseq_id} )";
}
my $all_projects = join( ",", $sbeams->getAccessibleProjects() );
my $curr_bid = $sbeamsMOD->getCurrentAtlasBuildID( parameters_ref => \%args );
# my $null_clause = ( $args{show_all} ) ? 'OR search_key_type IS NULL' : '';
# SQL to fetch ortholog groups and bioseqs in them.
my $sql =<<" END_SQL";
SELECT DISTINCT ortholog_group,
biosequence_name,
organism_name,
search_key_name,
CAST( biosequence_seq AS VARCHAR(4000) ),
biosequence_id,
B.biosequence_set_id,
ORG.organism_id
FROM biolink.dbo.ortholog O
JOIN $TBAT_BIOSEQUENCE B ON O.entry_accession = B.biosequence_name
JOIN $TBAT_ATLAS_BUILD AB ON AB.biosequence_set_id = B.biosequence_set_id
JOIN $TB_ORGANISM ORG ON O.ortholog_organism_id = ORG.organism_id
LEFT JOIN $TBAT_SEARCH_KEY SK ON SK.resource_name = B.biosequence_name
WHERE ortholog_group = '$args{group_id}'
-- AND AB.project_id IN ( $all_projects )
AND ( search_key_type = 'Full Name'
OR search_key_type = 'Description'
OR search_key_type LIKE '%Symbol'
OR search_key_type LIKE 'Annotation ID'
OR search_key_type IS NULL
OR search_key_type = 'RefSeq'
OR search_key_type = 'ORF Name'
)
$bioseq_clause
-- AND ORG.organism_id = 38
ORDER BY organism_name, B.biosequence_name ASC
END_SQL
# 0 ortholog_group,
# 1 biosequence_name,
# 2 organism_name,
# 3 search_key_name,
# 4 CAST( biosequence_seq AS VARCHAR(4000) ),
# 5 biosequence_id
# 6 biosequence_set_id
# 7 organism_id
$log->debug( "prepare stmt: $sql " . time() );
my $sth = $sbeams->get_statement_handle( $sql );
$log->debug( "prepare stmt done: " .time() );
# array of protein/ortholog info
my @all_proteins;
# hash of biosequence_ids -> seq or name
my %bioseq_id2seq;
my %bioseq_id2name;
# hash seq <=> accession
my %seq2acc;
my %acc2seq;
# Store (degenerate) acc -> bioseq_id
my %acc2bioseq_id;
# Store organism for each biosequence set
my %bss2org;
my %organisms;
# Counter
my $cnt = 0;
while ( my @row = $sth->fetchrow_array() ) {
$organisms{$row[7]}++;
my $seq = $row[4];
$seq =~ s/[^a-zA-Z]//g;
$bss2org{$row[6]} ||= $row[2];
# Check this out later for dups...
$seq2acc{$seq} ||= {};
$seq2acc{$seq}->{$row[1]}++;
$bioseq_id2seq{$row[5]} = $row[4];
$bioseq_id2name{$row[5]} = $row[1];
$acc2bioseq_id{$row[1]} ||= [];
push @{$acc2bioseq_id{$row[1]}}, $row[5];
# Cache first seq for each accession, and push onto display
unless ( $acc2seq{$row[1]} ) {
$acc2seq{$row[1]} = $seq;
# push row info for display
push @all_proteins, \@row;
}
$cnt++;
}
$log->debug( "Iterated $cnt rows: " .time() );
# not quite working yet!
my %dup_seqs;
my $dup_char = 'A';
# for my $seq ( keys( %seq2acc ) ) {
for my $prot ( @all_proteins ) {
my $seq = $prot->[4];
# $log->debug( "SEQ is $seq" );
next unless scalar(keys(%{$seq2acc{$seq}})) > 1;
for my $acc ( keys ( %{$seq2acc{$seq}} ) ) {
next if $dup_seqs{$acc};
$dup_seqs{$acc} = $dup_char;
}
$dup_char++;
}
# Get stringified version for IN clause.
my $bioseq_ids = join( ",", sort{ $a <=> $b } keys( %bioseq_id2name ) );
if ( !$bioseq_ids ) {
return "
makeInfoText( "No matching sequences found for ortholog group $args{group_id}" );
}
my $organisms = join( ',', keys( %organisms ) );
my $default_builds = get_default_builds( $all_projects, $organisms );
# We will get just one build per organism for now #Punt
# my $accessible_builds = get_accessible_builds( $all_projects, $organisms );
# Fixed order list of builds - first use default builds, then any
# accessible build
my @build_ids = sort { $default_builds->{$a} cmp $default_builds->{$b} } keys( %{$default_builds} );
# push @build_ids, sort { $accessible_builds->{$a} cmp $accessible_builds->{$b} } keys( %{$accessible_builds} );
my %bseen;
my $build_str;
for my $id ( @build_ids ) {
next if $bseen{$id}++;
my $sep = ( $build_str ) ? ',' : '';
$build_str .= $sep . $id;
}
# hash of info keyed by build_id
my %build_seqs;
# To be passed to checkbox list routine.
my %build_info;
my %view_org2build;
my %view_build2org;
# Skip non-default builds, this var controls debug stmt
my %skippy;
if ( $build_str ) {
# Define SQL to fetch info about builds in which a peptide has been observed
my $sql =<<" ENDSQL";
SELECT distinct
PI.peptide_instance_id,
n_observations,
PM.matched_biosequence_id,
PI.atlas_build_id,
atlas_build_name,
peptide_sequence,
AB.biosequence_set_id
-- AB.biosequence_set_id,
-- O.organism_name
FROM $TBAT_PEPTIDE_MAPPING PM
JOIN $TBAT_PEPTIDE_INSTANCE PI ON PI.peptide_instance_id = PM.peptide_instance_id
JOIN $TBAT_PEPTIDE P ON ( PI.peptide_id = P.peptide_id )
JOIN $TBAT_ATLAS_BUILD AB ON ( PI.atlas_build_id = AB.atlas_build_id )
-- JOIN $TBAT_BIOSEQUENCE_SET BS ON ( AB.biosequence_set_id = BS.biosequence_set_id )
-- JOIN $TB_ORGANISM O ON ( O.organism_id = BS.organism_id )
WHERE PM.matched_biosequence_id IN ( $bioseq_ids )
AND AB.atlas_build_id IN ( $build_str )
ORDER BY PI.atlas_build_id DESC
-- ORDER BY organism_name ASC, PI.atlas_build_id DESC
ENDSQL
$log->debug( "prepare instance stmt: " . time() );
# $log->debug( $sql );
my $sth = $sbeams->get_statement_handle( $sql );
$log->debug( "prepare instance stmt done: " .time() );
# 0 count(distinct PI.peptide_id),
# 1 sum( n_observations ),
# 2 PM.matched_biosequence_id,
# 3 PI.atlas_build_id,
# 4 atlas_build_name,
# 5 peptide_sequence
# 6 fizzle
# 7 biosequence_set_id
# 0 new PI.peptide_instance_id,
# 1 new n_observations,
# 2 new PM.matched_biosequence_id,
# 3 new PI.atlas_build_id,
# 4 new atlas_build_name,
# 5 new peptide_sequence,
# 6 new biosequence_set_id,
$cnt = 0;
# Loop to calculate coverate, per prot n_obs for each build
$log->debug( "Iterate, calc coverage: " .time() );
while ( my @row = $sth->fetchrow_array() ) {
# Effectively eliminate non-default builds
# if ( !$default_builds->{$row[3]} ) {
# $log->debug( "skipping $row[3] 'cause its not default" ) if !$skippy{$row[3]}++;
# next;
# }
my $org = $bss2org{$row[6]};
$build_info{$row[3]} ||= {};
$build_info{$row[3]}->{name} = $row[4];
$build_info{$row[3]}->{org} = $org;
$build_info{$row[3]}->{is_default} = ( $row[6] ) ? 'yes' : 'no';
$build_info{$row[3]}->{is_curr} = ( $row[3] == $curr_bid ) ? 1 : 0;
$build_info{$row[3]}->{bss} = $row[7];
# Simplistic initial view mechanism
if ( !$view_org2build{$org} ) {
$build_info{$row[3]}->{visible}++;
$build_info{$row[3]}->{view}++;
$view_org2build{$org} = $row[3];
$view_build2org{$row[3]} = $org;
# $log->debug( "View build for $org is $row[3] ( $row[4] )" );
} elsif ( $row[3] == $curr_bid ) {
$build_info{$row[3]}->{visible}++;
$build_info{$row[3]}->{view}++;
# Gotta unset the build
my $deprecated = $view_org2build{$org};
# $log->debug( "Replacing info from $build_info{$deprecated}->{name} with info from $build_info{$row[3]}->{name}, 'cause its current!" );
$build_info{$deprecated}->{visible} = 0;
$build_info{$deprecated}->{view} = 0;
delete $view_build2org{$deprecated};
$view_org2build{$org} = $row[3];
$view_build2org{$row[3]} = $org;
}
# error cases, shouldn't happen?
next unless $row[0];
# if ( $row[0] > 1 ) {
# $log->warn( "count peptide_id > 1" );
# $log->warn( join( ", ", @row ) );
# }
$build_seqs{$row[3]} ||= {};
$build_seqs{$row[3]}->{$row[2]} ||= {};
$build_seqs{$row[3]}->{$row[2]}->{count}++;
$build_seqs{$row[3]}->{$row[2]}->{n_obs} += $row[1];
# Calc coverage for this build/protein combination
$build_seqs{$row[3]}->{$row[2]}->{coverage} ||= {};
my $posn = $sbeamsMOD->get_site_positions( pattern => $row[5],
seq => $bioseq_id2seq{$row[2]} );
for my $p ( @$posn ) {
for ( my $i = 0; $i < length($row[5]); $i++ ){
my $covered_posn = $p + $i;
$build_seqs{$row[3]}->{$row[2]}->{coverage}->{$covered_posn}++;
}
}
# Going to cache with accession for ease of use!
$build_seqs{$row[3]}->{$bioseq_id2name{$row[2]}}->{coverage} = $build_seqs{$row[3]}->{$row[2]}->{coverage};
$build_seqs{$row[3]}->{$bioseq_id2name{$row[2]}}->{bioseq} = $row[2];
$cnt++;
}
$log->debug( "Iterated $cnt rows: " .time() );
}
# Table headings
my @headings = ( 'Accession', 'Organism' );
# Index of build col in display row, 2 already added above
my $b_cnt = 2;
my %build_posn;
$log->debug( "Iterate, prepare display: " .time() );
# Seen builds skipped, allows us to use composite of default and accessible.
my %seen;
for my $build_id ( @build_ids ) {
next if $seen{$build_id}++;
$build_posn{$b_cnt++} = $build_id;
my $dag = '';
my $display_name = $build_info{$build_id}->{name};
# $display_name =~ s/Yeast|Mouse|Human|Drosophila//;
$display_name =~ s/Peptide//;
$display_name =~ s/Atlas//;
$display_name =~ s/P\s*>*0.\d//;
$display_name =~ s/^_//;
$display_name =~ s/_$//;
$build_info{$build_id}->{display_name} = $display_name;
if ( $view_build2org{$build_id} ) {
$dag = '
The table below shows information about homologous(ortholog/paralog) proteins in the $group_link ortholog group. The groups were computed by OrthoMCL version 5, and are based on BLAST homology. The numbers shown in the columns under each build are the total number of observations of peptides that can be mapped to a given protein, and the total number of distinct peptides seen: total observations ( number distinct peptides ).
$alignment_overview
Alignment was created by the ClustalW program, which is maintained at the Conway Institute UCD Dublin. The alignment considers physical properties of the amino acids in the protein sequence, and the program was invoked using the following command-line: clustalw -tree -align -outorder=input -infile=clustal_file The consensus 'sequence' uses symbols to represent the level of conservation of amino acids at any given position. The text below, adapted from the clustal documentation, describes the various symbols used. CONSENSUS SYMBOLS: An alignment will display by default the following symbols denoting the degree of conservation observed in each column: "*" means that the residues or nucleotides in that column are identical in all sequences in the alignment. ":" means that conserved substitutions have been observed "." means that semi-conserved substitutions are observed. " " means that substitutions are not conservative.