zhangxudong 2 tahun lalu
induk
melakukan
71cd505a04
2 mengubah file dengan 29 tambahan dan 5 penghapusan
  1. 26 3
      abundance_estimates_to_matrix.pl
  2. 3 2
      run-featurecounts.R

+ 26 - 3
abundance_estimates_to_matrix.pl

@@ -95,6 +95,7 @@ else {
 1	counts
 2	fpkm
 3	tpm
+4	cpm
 ## RSEM:
 
 0       transcript_id
@@ -144,7 +145,7 @@ else {
     
     ;
 
-my ($acc_field, $counts_field, $fpkm_field, $tpm_field);
+my ($acc_field, $counts_field, $fpkm_field, $tpm_field, $cpm_field);
 
 if ($est_method =~ /^rsem$/i) {
     $acc_field = 0;
@@ -175,6 +176,7 @@ elsif ($est_method =~ /^featureCounts/) {
     $counts_field = "counts";
     $fpkm_field = "fpkm";
     $tpm_field = "tpm";
+    $cpm_field = "cpm";
 }
 else {
     die "Error, dont recognize --est_method [$est_method] ";
@@ -199,10 +201,12 @@ main: {
             my $count = $x[ $fields{$counts_field} ];
             my $fpkm = $x[ $fields{$fpkm_field} ];
             my $tpm = $x[ $fields{$tpm_field} ];
+            my $cpm = $x[ $fields{$cpm_field} ];
 
             $data{$acc}->{$file}->{count} = $count;
             $data{$acc}->{$file}->{FPKM} = $fpkm;
             $data{$acc}->{$file}->{TPM} = $tpm;
+            $data{$acc}->{$file}->{CPM} = $cpm;
         }
         close $fh;
     }
@@ -221,8 +225,10 @@ main: {
     
     my $counts_matrix_file = "$out_prefix.counts.matrix";
     my $TPM_matrix_file = "$out_prefix.TPM.not_cross_norm";
+    my $CPM_matrix_file = "$out_prefix.CPM.not_cross_norm";
     open (my $ofh_counts, ">$counts_matrix_file") or die "Error, cannot write file $counts_matrix_file";
     open (my $ofh_TPM, ">$TPM_matrix_file") or die "Error, cannot write file $TPM_matrix_file";
+    open (my $ofh_CPM, ">$CPM_matrix_file") or die "Error, cannot write file $CPM_matrix_file";
 
     { # check to see if they're unique
         my %filename_map = map { + $_ => 1 } @filenames;
@@ -243,11 +249,13 @@ main: {
 
     print $ofh_counts join("\t", "", @filenames) . "\n";
     print $ofh_TPM join("\t", "", @filenames) . "\n";
+    print $ofh_CPM join("\t", "", @filenames) . "\n";
 
     foreach my $acc (keys %data) {
         
         print $ofh_counts "$acc";
         print $ofh_TPM "$acc";
+        print $ofh_CPM "$acc";
         
         foreach my $file (@files) {
 
@@ -262,28 +270,43 @@ main: {
             else {
                 $tpm = "NA";
             }
+            my $cpm = $data{$acc}->{$file}->{CPM};
+            if (defined $cpm) {
+                $cpm = $cpm/1;
+            }
+            else {
+                $cpm = "NA";
+            }
+
 
             print $ofh_counts "\t$count";
             print $ofh_TPM "\t$tpm";
+            print $ofh_CPM "\t$cpm";
         }
         
         print $ofh_counts "\n";
         print $ofh_TPM "\n";
+        print $ofh_CPM "\n";
 
     }
     close $ofh_counts;
     close $ofh_TPM;
+    close $ofh_CPM;
     
 
     if (scalar @files > 1) {
         ## more than one sample 
         
         if ($cross_sample_norm =~ /^TMM$/i) {
-            my $cmd = "$FindBin::RealBin/support_scripts/run_TMM_scale_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.EXPR.matrix";
+            my $cmd = "$FindBin::RealBin/support_scripts/run_TMM_scale_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.TPM.matrix";
+            &process_cmd($cmd);
+            $cmd = "$FindBin::RealBin/support_scripts/run_TMM_scale_matrix.pl --matrix $CPM_matrix_file > $out_prefix.$cross_sample_norm.CPM.matrix";
             &process_cmd($cmd);
         }
         elsif ($cross_sample_norm =~ /^UpperQuartile$/) {
-            my $cmd = "$FindBin::RealBin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.EXPR.matrix";
+            my $cmd = "$FindBin::RealBin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.TPM.matrix";
+            &process_cmd($cmd);
+            $cmd = "$FindBin::RealBin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $CPM_matrix_file > $out_prefix.$cross_sample_norm.CPM.matrix";
             &process_cmd($cmd);
         }
         elsif ($cross_sample_norm =~ /^none$/i) {

+ 3 - 2
run-featurecounts.R

@@ -30,11 +30,12 @@ outCountsFilePath <- paste(outFilePref, '.count', sep = '');
 
 fCountsList = featureCounts(bamFile, annot.ext=gtfFile, isGTFAnnotationFile=TRUE, nthreads=nthreads, GTF.featureType=argv$featureType, GTF.attrType=argv$attrType, isPairedEnd=argv$isPairedEnd, strandSpecific=argv$strandSpecific)
 dgeList = DGEList(counts=fCountsList$counts, genes=fCountsList$annotation)
+cpm = cpm(dgeList)
 fpkm = rpkm(dgeList, dgeList$genes$Length)
 tpm = exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
 
 write.table(fCountsList$stat, outStatsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
 
-featureCounts = cbind(fCountsList$annotation[,1], fCountsList$counts, fpkm, tpm)
-colnames(featureCounts) = c('gene_id', 'counts', 'fpkm','tpm')
+featureCounts = cbind(fCountsList$annotation[,1], fCountsList$counts, fpkm, tpm, cpm)
+colnames(featureCounts) = c('gene_id', 'counts', 'fpkm','tpm', 'cpm')
 write.table(featureCounts, outCountsFilePath, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)