Browse Source

When processing the HTML from an InterProScan job, empty tables are removed from the output. Empty rows within tables are also removed.

mestato 14 years ago
parent
commit
6698546c6e
1 changed file with 176 additions and 146 deletions
  1. 176 146
      tripal_analysis_interpro/tripal_analysis_interpro.module

+ 176 - 146
tripal_analysis_interpro/tripal_analysis_interpro.module

@@ -516,158 +516,188 @@ function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $p
 	$interval = intval($no_iterations * 0.01);
 	$idx_iterations = 0;
 
-	// Processed the tables
-	foreach ($tables as $table) {
-		if ($table->getName() == 'table') {
-			// Set job status
-			$idx_iterations ++;
-			if ($idx_iterations % $interval == 0) {
-				$percentage = (int) ($idx_iterations / $no_iterations * 100);
-				db_set_active($previous_db);
-				tripal_job_set_progress($job_id, $percentage);
-				$previous_db = db_set_active('chado');
-				print $percentage."% ";
-			}
-
-			// Get the first row and match its name with the feature name
-			$firsttd = $table->children()->children()->children();
-			$feature_id = 0;
-			foreach($firsttd as $b) {
-				foreach($b->children() as $a) {
-					if ($a->getName() == 'a') {
-						// Remove _ORF from the sequence name
-						$seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a);
-
-						// Find out how many features match this uniquename
-						$sql = "SELECT count(feature_id) FROM {feature} ".
-                         "WHERE uniquename = '%s' ";
-						$no_features = db_result(db_query($sql, $seqname));
-						 
-						// If there is only one match, get the feature_id
-						if ($no_features == 1) {
-							$sql = "SELECT feature_id FROM {feature} ".
-                            "WHERE uniquename = '%s' ";
-							$feature_id = db_result(db_query($sql, $seqname));
-
-							// If the uniquename matches more than one features then skip and print 'Ambiguous'
-						} else if ($no_features > 1) {
-							fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
-							continue;
-
-							// If the uniquename did not match, skip and print 'Failed'
-						} else {
-							fwrite($log, "Failed: ".$seqname."\n");
-						}
-
-					}
-				}
-			}
-			// Successfully matched. print 'Succeeded'. Add analysis_id and
-			// feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
-			if ($feature_id) {
-
-				//------------------------------------
-				// Insert into analysisfeature table
-				//------------------------------------
-				$sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
-                       "VALUES (%d, %d)";
-				db_query ($sql, $feature_id, $analysis_id);
-				
-			   // If a matched feature is found, write to log.
-            fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
-
-            // Get the higest rank for this feature_id in analysisfeatureprop table.
-            // If the value of the inserting content is not duplicate, add it to  
-            // analysisfeaturepro with 'higest_rank + 1'
-            $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
-                   "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".       
-                   "WHERE feature_id=%d ".
-                   "AND analysis_id=%d ".
-                   "AND type_id=%d ";
-
-            $afp =  db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
-            $hi_rank = 0;
-            if ($afp) {
-               $hi_rank = $afp->max + 1;
+    // Processed the tables
+    foreach ($tables as $table) {
+        //if (preg_match('/No hits reported/', $table->asXML()) ) {
+            //print "skipping this table b/c no hits are reported\n";
+        //}
+        // make sure we are looking at a table and its not an empty table
+        if ($table->getName() == 'table' && !preg_match('/No hits reported/', $table->asXML()) ) {
+            $idx_iterations ++;
+            if ($idx_iterations % $interval == 0) {
+                    $percentage = (int) ($idx_iterations / $no_iterations * 100);
+                    db_set_active($previous_db);
+                    tripal_job_set_progress($job_id, $percentage);
+                    $previous_db = db_set_active('chado');
+                    print $percentage."% ";
             }
-            
-				// Get the newly inserted analysisfeature_id
-				$sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
-				$analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
-				//------------------------------------------------------------
-				// Insert interpro html tags into analysisfeatureprop table
-				//------------------------------------------------------------
-				// Before inserting, make sure it's not a duplicate
-				$sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
-				$result = db_query($sql, $analysisfeature_id, $type_id);
-				$duplicate = 0;
-				while ($afp_value = db_fetch_object($result)) {
-				   if ($table->asXML() == $afp_value->value) {
-				      $duplicate = 1;
-				   }
-				}
-				if (!$duplicate) {
-				   $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
-                      "VALUES (%d, %d, '%s', %d)";
-				   db_query($sql, $analysisfeature_id, $type_id, $table->asXML(), $hi_rank);
-				   fwrite($log, " (Insert)\n"); // write to log
-				} else {
-				   fwrite($log, " (Skipped)\n");
-				}
-				
-			   // Parse GO terms. Make sure GO database schema is installed in chado
-            $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
-            if (!$go_db_id) {
-               print 'GO schema not installed in chado. GO terms are not processed.';
+
+            // Set job status
+            // Get the first row and match its name with the feature name
+            $firsttd = $table->children()->children()->children();
+            $feature_id = 0;
+            foreach($firsttd as $b) {
+                foreach($b->children() as $a) {
+                    if ($a->getName() == 'a') {
+                        // Remove _ORF from the sequence name
+                        $seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a);
+                        print "seqname is $seqname\n";
+
+                        // Find out how many features match this uniquename
+                        $sql = "SELECT count(feature_id) FROM {feature} ".
+                        "WHERE uniquename = '%s' ";
+                        $no_features = db_result(db_query($sql, $seqname));
+
+                        // If there is only one match, get the feature_id
+                        if ($no_features == 1) {
+                            $sql = "SELECT feature_id FROM {feature} ".
+                                    "WHERE uniquename = '%s' ";
+                            $feature_id = db_result(db_query($sql, $seqname));
+                            print "\tfeature id is $feature_id\n";
+
+                            // If the uniquename matches more than one features then skip and print 'Ambiguous'
+                        } else if ($no_features > 1) {
+                            fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
+                            continue;
+
+                        // If the uniquename did not match, skip and print 'Failed'
+                        } else {
+                            fwrite($log, "Failed: ".$seqname."\n");
+                        }
+
+                    }
+                }
             }
-            if ($go_db_id && $parsego) {
-               $trs = $table->children();
-               foreach ($trs as $tr) {
-                  $tds = $tr->children();
-                  foreach($tds as $td) {
-                     $gotags = $td->children();
-                     foreach ($gotags as $gotag) {
-                          // Look for 'GO:accession#'
-                        if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) {
-                        	
-                           // Find cvterm_id for the matched GO term
-                           $sql = "SELECT cvterm_id FROM {cvterm} CVT 
-                                   INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
-                                   WHERE DBX.accession = '%s' AND DBX.db_id = %d";
-                           $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
-                           
-                           //-------------------------------------------
-                           // Insert GO terms into feature_cvterm table
-                           //-------------------------------------------
-                           // Default pub_id = 1 (NULL) was used
-                           $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
-                                   VALUES (%d, %d, 1)";
-                           db_query($sql, $feature_id, $goterm_id);
-                                                     
-                           //------------------------------------------------
-                           // Insert GO terms into analysisfeatureprop table
-                           //------------------------------------------------
-                           $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
-                                  "VALUES (%d, %d, '%s', 0)";
-                           db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
-                           
+
+            // Successfully matched. print 'Succeeded'. Add analysis_id and
+            // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
+            if ($feature_id) {
+
+                //------------------------------------
+                // Clease unwanted rows from the table
+                //------------------------------------
+
+                $parent_row =   "/<tr><td valign=\"top\"><b>Parent<\/b><\/td>\s*<td valign=\"top\">\s*no.*?parent<\/td>\s*<\/tr>/";
+                $children_row = "/<tr><td valign=\"top\"><b>Children<\/b><\/td>\s*<td valign=\"top\">\s*no.*?children<\/td>\s*<\/tr>/";
+                $found_row    = "/<tr><td valign=\"top\"><b>Found.*?in<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/";
+                $contains_row = "/<tr><td valign=\"top\"><b>Contains<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/";
+                $go_row       = "/<tr><td valign=\"top\"><b>GO.*?terms<\/b><\/td>\s*<td valign=\"top\">\s*none<\/td>\s*<\/tr>/";
+
+                $table_txt = $table->asXML();
+                $table_txt = preg_replace($parent_row, "", $table_txt);
+                $table_txt = preg_replace($children_row, "", $table_txt);
+                $table_txt = preg_replace($found_row, "", $table_txt);
+                $table_txt = preg_replace($contains_row, "", $table_txt);
+                $table_txt = preg_replace($go_row, "", $table_txt);
+
+                //print "----------------------------\n";
+                //print "old: ".$table->asXML()."\n\n\n";
+                //print "----------------------------\n";
+                //print "Fixed: $table_txt\n";
+                //print "----------------------------\n";
+
+                //------------------------------------
+                // Insert into analysisfeature table
+                //------------------------------------
+                $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
+                        "VALUES (%d, %d)";
+                db_query ($sql, $feature_id, $analysis_id);
+
+                // If a matched feature is found, write to log.
+                fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
+
+                // Get the higest rank for this feature_id in analysisfeatureprop table.
+                // If the value of the inserting content is not duplicate, add it to  
+                // analysisfeaturepro with 'higest_rank + 1'
+                $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
+                        "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".
+                        "WHERE feature_id=%d ".
+                        "AND analysis_id=%d ".
+                        "AND type_id=%d ";
+
+                $afp =  db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
+                $hi_rank = 0;
+                if ($afp) {
+                    $hi_rank = $afp->max + 1;
+                }
+                // Get the newly inserted analysisfeature_id
+                $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
+                $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
+                //------------------------------------------------------------
+                // Insert interpro html tags into analysisfeatureprop table
+                //------------------------------------------------------------
+                // Before inserting, make sure it's not a duplicate
+                $sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
+                $result = db_query($sql, $analysisfeature_id, $type_id);
+                $duplicate = 0;
+                while ($afp_value = db_fetch_object($result)) {
+                    if ($table_txt == $afp_value->value) {
+                        $duplicate = 1;
+                    }
+                }
+                if (!$duplicate) {
+                    $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
+                            "VALUES (%d, %d, '%s', %d)";
+                    db_query($sql, $analysisfeature_id, $type_id, $table_txt, $hi_rank);
+                    fwrite($log, " (Insert)\n"); // write to log
+                    print "\twriting table\n";
+                } else {
+                   fwrite($log, " (Skipped)\n");
+                    print "\tskipping table - dup\n";
+                }
+
+                // Parse GO terms. Make sure GO database schema is installed in chado
+                $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
+                if (!$go_db_id) {
+                    print 'GO schema not installed in chado. GO terms are not processed.';
+                }
+                if ($go_db_id && $parsego) {
+                    $trs = $table->children();
+                    foreach ($trs as $tr) {
+                        $tds = $tr->children();
+                        foreach($tds as $td) {
+                            $gotags = $td->children();
+                            foreach ($gotags as $gotag) {
+                                // Look for 'GO:accession#'
+                                if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) {
+
+                                    // Find cvterm_id for the matched GO term
+                                    $sql = "SELECT cvterm_id FROM {cvterm} CVT 
+                                            INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
+                                            WHERE DBX.accession = '%s' AND DBX.db_id = %d";
+                                    $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
+
+                                    //-------------------------------------------
+                                    // Insert GO terms into feature_cvterm table
+                                    //-------------------------------------------
+                                    // Default pub_id = 1 (NULL) was used
+                                    $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
+                                            VALUES (%d, %d, 1)";
+                                    db_query($sql, $feature_id, $goterm_id);
+
+                                    //------------------------------------------------
+                                    // Insert GO terms into analysisfeatureprop table
+                                    //------------------------------------------------
+                                    $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
+                                            "VALUES (%d, %d, '%s', 0)";
+                                    db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
+                                }
+                            }
                         }
-                     }
-                  }
-               }
+                    }
+                }
             }
-				
-			}
-		}
-	}
-	db_set_active ($previous_db); // Use drupal database
-	print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
-	
-	fwrite($log, "\n");
-	fclose($log);
-   return;
+        }
+    }
+    db_set_active ($previous_db); // Use drupal database
+    print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
+
+    fwrite($log, "\n");
+    fclose($log);
+    return;
 }
 
+
 /*******************************************************************************
  * tripal_analysis_interpro_nodeapi()
  * HOOK: Implementation of hook_nodeapi()