run_TMM_scale_matrix.pl 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. #!/usr/bin/env perl
  2. use strict;
  3. use warnings;
  4. use Carp;
  5. use Getopt::Long qw(:config no_ignore_case bundling);
  6. use Cwd;
  7. use FindBin;
  8. use File::Basename;
  9. use lib ("$FindBin::RealBin/../../PerlLib");
  10. use Data::Dumper;
  11. my $usage = <<__EOUSAGE__;
  12. #################################################################################################
  13. #
  14. # Required:
  15. #
  16. # --matrix <string> matrix of raw read counts (not normalized!)
  17. #
  18. ################################################################################################
  19. __EOUSAGE__
  20. ;
  21. my $matrix_file;
  22. my $help_flag;
  23. &GetOptions ( 'h' => \$help_flag,
  24. 'matrix=s' => \$matrix_file,
  25. );
  26. if ($help_flag) {
  27. die $usage;
  28. }
  29. unless ($matrix_file) {
  30. die $usage;
  31. }
  32. main: {
  33. my $tmm_info_file = &run_TMM($matrix_file);
  34. &write_normalized_file($matrix_file, $tmm_info_file);
  35. exit(0);
  36. }
  37. ####
  38. sub run_TMM {
  39. my ($counts_matrix_file) = @_;
  40. my $tmm_norm_script = "__tmp_runTMM.R";
  41. open (my $ofh, ">$tmm_norm_script") or die "Error, cannot write to $tmm_norm_script";
  42. #print $ofh "source(\"$FindBin::RealBin/R/edgeR_funcs.R\")\n";
  43. print $ofh "library(edgeR)\n\n";
  44. print $ofh "rnaseqMatrix = read.table(\"$counts_matrix_file\", header=T, row.names=1, com='', check.names=F)\n";
  45. print $ofh "rnaseqMatrix = as.matrix(rnaseqMatrix)\n";
  46. print $ofh "rnaseqMatrix = round(rnaseqMatrix)\n";
  47. print $ofh "exp_study = DGEList(counts=rnaseqMatrix, group=factor(colnames(rnaseqMatrix)))\n";
  48. print $ofh "exp_study = calcNormFactors(exp_study)\n";
  49. print $ofh "exp_study\$samples\$eff.lib.size = exp_study\$samples\$lib.size * exp_study\$samples\$norm.factors\n";
  50. print $ofh "write.table(exp_study\$samples, file=\"$counts_matrix_file.TMM_info.txt\", quote=F, sep=\"\\t\", row.names=F)\n";
  51. close $ofh;
  52. &process_cmd("Rscript $tmm_norm_script 1>&2 ");
  53. return("$counts_matrix_file.TMM_info.txt");
  54. }
  55. ####
  56. sub process_cmd {
  57. my ($cmd) = @_;
  58. print STDERR "CMD: $cmd\n";
  59. my $ret = system($cmd);
  60. if ($ret) {
  61. die "Error, cmd: $cmd died with ret ($ret) ";
  62. }
  63. return;
  64. }
  65. ####
  66. sub write_normalized_file {
  67. my ($matrix_file, $tmm_info_file) = @_;
  68. my %col_to_eff_lib_size;
  69. my %col_to_norm_factor;
  70. open (my $fh, $tmm_info_file) or die "Error, cannot open file $tmm_info_file";
  71. my %bloom_to_col;
  72. my $header = <$fh>;
  73. while (<$fh>) {
  74. chomp;
  75. my @x = split(/\t/);
  76. my ($col, $norm_factor, $eff_lib_size) = ($x[0], $x[2], $x[3]);
  77. $col =~ s/\"//g;
  78. my $bloom = $col;
  79. $bloom =~ s/\W/$;/g;
  80. $col_to_eff_lib_size{$col} = $eff_lib_size;
  81. $col_to_norm_factor{$col} = $norm_factor;
  82. if ($bloom ne $col) {
  83. if (exists $bloom_to_col{$bloom}) {
  84. die "Error, already stored $bloom_to_col{$bloom} for $bloom, but trying to also store $col here... Ensure column names are unique according to non-word characters.";
  85. }
  86. $col_to_eff_lib_size{$bloom} = $eff_lib_size;
  87. $col_to_norm_factor{$bloom} = $norm_factor;
  88. }
  89. }
  90. close $fh;
  91. open ($fh, $matrix_file);
  92. $header = <$fh>;
  93. chomp $header;
  94. my @pos_to_col = split(/\t/, $header);
  95. my $check_column_ordering_flag = 0;
  96. while (<$fh>) {
  97. chomp;
  98. my @x = split(/\t/);
  99. unless ($check_column_ordering_flag) {
  100. if (scalar(@x) == scalar(@pos_to_col) + 1) {
  101. ## header is offset, as is acceptable by R
  102. ## not acceptable here. fix it:
  103. unshift (@pos_to_col, "");
  104. }
  105. $check_column_ordering_flag = 1;
  106. print join("\t", @pos_to_col) . "\n";
  107. }
  108. my $gene = $x[0];
  109. print $gene;
  110. for (my $i = 1; $i <= $#x; $i++) {
  111. my $col = $pos_to_col[$i];
  112. $col =~ s/\"//g;
  113. my $adj_col = $col;
  114. $adj_col =~ s/-/\./g;
  115. my $bloom = $col;
  116. $bloom =~ s/\W/$;/g;
  117. my $eff_lib_size = $col_to_eff_lib_size{$col}
  118. || $col_to_eff_lib_size{$bloom}
  119. || $col_to_eff_lib_size{$adj_col}
  120. || $col_to_eff_lib_size{"X$col"}
  121. || die "Error, no eff lib size for [$col] or [$bloom] or [$adj_col] or [\"X$col\"]" . Dumper(\%col_to_eff_lib_size);
  122. my $norm_factor = $col_to_norm_factor{$col}
  123. || $col_to_norm_factor{$bloom}
  124. || $col_to_norm_factor{$adj_col}
  125. || $col_to_norm_factor{"X$col"}
  126. || die "Error, no normalization scaling factor for $col" . Dumper(\%col_to_norm_factor);
  127. my $read_count = $x[$i];
  128. my $converted_val = sprintf("%.3f", $read_count * 1/$norm_factor);
  129. print "\t$converted_val";
  130. }
  131. print "\n";
  132. }
  133. return;
  134. }