123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156 |
- #!/usr/bin/env perl
- use strict;
- use warnings;
- use FindBin;
- use lib ("$FindBin::RealBin/../PerlLib");
- use Fasta_reader;
- use BHStats;
- my $usage = "\n\nusage: $0 corset.fasta\n\n";
- my $fasta_file = $ARGV[0] or die $usage;
- main: {
- my $fasta_reader = new Fasta_reader($fasta_file);
-
- my @all_seq_lengths;
- my $number_transcripts = 0;
-
- my $num_GC = 0;
-
- my %component_to_longest_isoform;
-
- my $tot_seq_len = 0;
- my $missing_gene_ids_flag = 0;
-
- while (my $seq_obj = $fasta_reader->next()) {
- my $acc = $seq_obj->get_accession();
-
- $number_transcripts++;
- my $comp_id = $acc;
- if (! $missing_gene_ids_flag) {
-
- if ($acc =~ /^(Cluster-\d+.\d+)_/) {
- $comp_id = $1;
- }
- else {
- print STDERR "Error, cannot decipher gene identifier from acc: $acc";
- $missing_gene_ids_flag = 1;
- }
- }
-
- my $sequence = $seq_obj->get_sequence();
- my $seq_len = length($sequence);
- $tot_seq_len += $seq_len;
- if ( (! exists $component_to_longest_isoform{$comp_id})
- ||
- $component_to_longest_isoform{$comp_id} < $seq_len) {
-
- $component_to_longest_isoform{$comp_id} = $seq_len;
- }
-
- push (@all_seq_lengths, $seq_len);
-
- while ($sequence =~ /[gc]/ig) {
- $num_GC++;
- }
- }
- print "\n\n";
- print "################################\n";
- print "## Counts of transcripts, etc.\n";
- print "################################\n";
- print "Total corset 'clusters':\t" . scalar(keys %component_to_longest_isoform) . "\n";
- print "Total trinity transcripts:\t" . $number_transcripts . "\n";
-
- my $pct_gc = sprintf("%.2f", $num_GC / $tot_seq_len * 100);
-
- print "Percent GC: $pct_gc\n\n";
-
- print "########################################\n";
- print "Stats based on ALL transcript contigs:\n";
- print "########################################\n\n";
- &report_stats(@all_seq_lengths);
- print "\n\n";
-
- if ($missing_gene_ids_flag) {
- print " - note: not reporting gene-based longest isoform info since couldn't parse Trinity accession info.\n";
- }
- else {
- print "#####################################################\n";
- print "## Stats based on ONLY LONGEST ISOFORM per 'GENE':\n";
- print "#####################################################\n\n";
-
- &report_stats(values %component_to_longest_isoform);
-
- print "\n\n\n";
- }
-
-
- exit(0);
- }
- ####
- sub report_stats {
- my (@seq_lengths) = @_;
- @seq_lengths = reverse sort {$a<=>$b} @seq_lengths;
-
- my $cum_seq_len = 0;
- foreach my $len (@seq_lengths) {
- $cum_seq_len += $len;
- }
-
- for (my $i = 10; $i <= 50; $i += 10) {
- my $cum_len_needed = $cum_seq_len * $i/100;
- my $N_val = &get_contigNvalue($cum_len_needed, \@seq_lengths);
- print "\tContig N$i: $N_val\n";
- }
- print "\n";
-
- my $median_len = &BHStats::median(@seq_lengths);
- print "\tMedian contig length: $median_len\n";
- my $avg_len = sprintf("%.2f", &BHStats::avg(@seq_lengths));
- print "\tAverage contig: $avg_len\n";
-
-
- print "\tTotal assembled bases: $cum_seq_len\n";
- return;
- }
- sub get_contigNvalue {
- my ($cum_len_needed, $seq_lengths_aref) = @_;
-
- my $partial_sum_len = 0;
- foreach my $len (@$seq_lengths_aref) {
- $partial_sum_len += $len;
- if ($partial_sum_len >= $cum_len_needed) {
- return($len);
- }
- }
- return -1; # shouldn't happen.
- }
|