123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201 |
- #!/usr/bin/env perl
- use strict;
- use warnings;
- use Carp;
- use Getopt::Long qw(:config no_ignore_case bundling);
- use Cwd;
- use FindBin;
- use File::Basename;
- use lib ("$FindBin::RealBin/../../PerlLib");
- use Data::Dumper;
- my $usage = <<__EOUSAGE__;
- #################################################################################################
- #
- # Required:
- #
- # --matrix <string> matrix of raw read counts (not normalized!)
- #
- ################################################################################################
- __EOUSAGE__
- ;
- my $matrix_file;
- my $help_flag;
- &GetOptions ( 'h' => \$help_flag,
- 'matrix=s' => \$matrix_file,
- );
- if ($help_flag) {
- die $usage;
- }
- unless ($matrix_file) {
- die $usage;
- }
- main: {
-
- my $tmm_info_file = &run_TMM($matrix_file);
-
- &write_normalized_file($matrix_file, $tmm_info_file);
-
- exit(0);
- }
- ####
- sub run_TMM {
- my ($counts_matrix_file) = @_;
-
- my $tmm_norm_script = "__tmp_runTMM.R";
- open (my $ofh, ">$tmm_norm_script") or die "Error, cannot write to $tmm_norm_script";
- #print $ofh "source(\"$FindBin::RealBin/R/edgeR_funcs.R\")\n";
-
- print $ofh "library(edgeR)\n\n";
-
- print $ofh "rnaseqMatrix = read.table(\"$counts_matrix_file\", header=T, row.names=1, com='', check.names=F)\n";
- print $ofh "rnaseqMatrix = as.matrix(rnaseqMatrix)\n";
- print $ofh "rnaseqMatrix = round(rnaseqMatrix)\n";
- print $ofh "exp_study = DGEList(counts=rnaseqMatrix, group=factor(colnames(rnaseqMatrix)))\n";
- print $ofh "exp_study = calcNormFactors(exp_study)\n";
-
- print $ofh "exp_study\$samples\$eff.lib.size = exp_study\$samples\$lib.size * exp_study\$samples\$norm.factors\n";
- print $ofh "write.table(exp_study\$samples, file=\"$counts_matrix_file.TMM_info.txt\", quote=F, sep=\"\\t\", row.names=F)\n";
-
- close $ofh;
- &process_cmd("Rscript $tmm_norm_script 1>&2 ");
-
- return("$counts_matrix_file.TMM_info.txt");
- }
- ####
- sub process_cmd {
- my ($cmd) = @_;
- print STDERR "CMD: $cmd\n";
- my $ret = system($cmd);
- if ($ret) {
- die "Error, cmd: $cmd died with ret ($ret) ";
- }
- return;
- }
- ####
- sub write_normalized_file {
- my ($matrix_file, $tmm_info_file) = @_;
- my %col_to_eff_lib_size;
- my %col_to_norm_factor;
- open (my $fh, $tmm_info_file) or die "Error, cannot open file $tmm_info_file";
-
- my %bloom_to_col;
- my $header = <$fh>;
- while (<$fh>) {
- chomp;
- my @x = split(/\t/);
- my ($col, $norm_factor, $eff_lib_size) = ($x[0], $x[2], $x[3]);
-
- $col =~ s/\"//g;
-
- my $bloom = $col;
- $bloom =~ s/\W/$;/g;
-
- $col_to_eff_lib_size{$col} = $eff_lib_size;
- $col_to_norm_factor{$col} = $norm_factor;
-
- if ($bloom ne $col) {
- if (exists $bloom_to_col{$bloom}) {
- die "Error, already stored $bloom_to_col{$bloom} for $bloom, but trying to also store $col here... Ensure column names are unique according to non-word characters.";
- }
- $col_to_eff_lib_size{$bloom} = $eff_lib_size;
- $col_to_norm_factor{$bloom} = $norm_factor;
- }
-
- }
- close $fh;
-
- open ($fh, $matrix_file);
- $header = <$fh>;
- chomp $header;
- my @pos_to_col = split(/\t/, $header);
- my $check_column_ordering_flag = 0;
-
- while (<$fh>) {
- chomp;
- my @x = split(/\t/);
-
- unless ($check_column_ordering_flag) {
- if (scalar(@x) == scalar(@pos_to_col) + 1) {
- ## header is offset, as is acceptable by R
- ## not acceptable here. fix it:
- unshift (@pos_to_col, "");
- }
- $check_column_ordering_flag = 1;
- print join("\t", @pos_to_col) . "\n";
- }
-
- my $gene = $x[0];
-
- print $gene;
- for (my $i = 1; $i <= $#x; $i++) {
- my $col = $pos_to_col[$i];
-
- $col =~ s/\"//g;
-
- my $adj_col = $col;
- $adj_col =~ s/-/\./g;
-
- my $bloom = $col;
- $bloom =~ s/\W/$;/g;
- my $eff_lib_size = $col_to_eff_lib_size{$col}
- || $col_to_eff_lib_size{$bloom}
- || $col_to_eff_lib_size{$adj_col}
- || $col_to_eff_lib_size{"X$col"}
- || die "Error, no eff lib size for [$col] or [$bloom] or [$adj_col] or [\"X$col\"]" . Dumper(\%col_to_eff_lib_size);
-
- my $norm_factor = $col_to_norm_factor{$col}
- || $col_to_norm_factor{$bloom}
- || $col_to_norm_factor{$adj_col}
- || $col_to_norm_factor{"X$col"}
- || die "Error, no normalization scaling factor for $col" . Dumper(\%col_to_norm_factor);
-
- my $read_count = $x[$i];
-
- my $converted_val = sprintf("%.3f", $read_count * 1/$norm_factor);
-
- print "\t$converted_val";
- }
- print "\n";
- }
-
- return;
- }
|