瀏覽代碼

When processing the HTML output of an InterProScan job, empty tables (those reporting no hits) are now removed from the output, and empty rows within the remaining tables are removed as well.

mestato 14 年之前
父節點
當前提交
6698546c6e
共有 1 個文件被更改,包括 176 次插入146 次删除
  1. 176 146
      tripal_analysis_interpro/tripal_analysis_interpro.module

+ 176 - 146
tripal_analysis_interpro/tripal_analysis_interpro.module

@@ -516,158 +516,188 @@ function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $p
 	$interval = intval($no_iterations * 0.01);
 	$idx_iterations = 0;
 
-	// Processed the tables
-	foreach ($tables as $table) {
-		if ($table->getName() == 'table') {
-			// Set job status
-			$idx_iterations ++;
-			if ($idx_iterations % $interval == 0) {
-				$percentage = (int) ($idx_iterations / $no_iterations * 100);
-				db_set_active($previous_db);
-				tripal_job_set_progress($job_id, $percentage);
-				$previous_db = db_set_active('chado');
-				print $percentage."% ";
-			}
-
-			// Get the first row and match its name with the feature name
-			$firsttd = $table->children()->children()->children();
-			$feature_id = 0;
-			foreach($firsttd as $b) {
-				foreach($b->children() as $a) {
-					if ($a->getName() == 'a') {
-						// Remove _ORF from the sequence name
-						$seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a);
-
-						// Find out how many features match this uniquename
-						$sql = "SELECT count(feature_id) FROM {feature} ".
-                         "WHERE uniquename = '%s' ";
-						$no_features = db_result(db_query($sql, $seqname));
-						 
-						// If there is only one match, get the feature_id
-						if ($no_features == 1) {
-							$sql = "SELECT feature_id FROM {feature} ".
-                            "WHERE uniquename = '%s' ";
-							$feature_id = db_result(db_query($sql, $seqname));
-
-							// If the uniquename matches more than one features then skip and print 'Ambiguous'
-						} else if ($no_features > 1) {
-							fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
-							continue;
-
-							// If the uniquename did not match, skip and print 'Failed'
-						} else {
-							fwrite($log, "Failed: ".$seqname."\n");
-						}
-
-					}
-				}
-			}
-			// Successfully matched. print 'Succeeded'. Add analysis_id and
-			// feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
-			if ($feature_id) {
-
-				//------------------------------------
-				// Insert into analysisfeature table
-				//------------------------------------
-				$sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
-                       "VALUES (%d, %d)";
-				db_query ($sql, $feature_id, $analysis_id);
-				
-			   // If a matched feature is found, write to log.
-            fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
-
-            // Get the higest rank for this feature_id in analysisfeatureprop table.
-            // If the value of the inserting content is not duplicate, add it to  
-            // analysisfeaturepro with 'higest_rank + 1'
-            $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
-                   "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".       
-                   "WHERE feature_id=%d ".
-                   "AND analysis_id=%d ".
-                   "AND type_id=%d ";
-
-            $afp =  db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
-            $hi_rank = 0;
-            if ($afp) {
-               $hi_rank = $afp->max + 1;
+    // Processed the tables
+    foreach ($tables as $table) {
+        //if (preg_match('/No hits reported/', $table->asXML()) ) {
+            //print "skipping this table b/c no hits are reported\n";
+        //}
+        // make sure we are looking at a table and its not an empty table
+        if ($table->getName() == 'table' && !preg_match('/No hits reported/', $table->asXML()) ) {
+            $idx_iterations ++;
+            if ($idx_iterations % $interval == 0) {
+                    $percentage = (int) ($idx_iterations / $no_iterations * 100);
+                    db_set_active($previous_db);
+                    tripal_job_set_progress($job_id, $percentage);
+                    $previous_db = db_set_active('chado');
+                    print $percentage."% ";
             }
-            
-				// Get the newly inserted analysisfeature_id
-				$sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
-				$analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
-				//------------------------------------------------------------
-				// Insert interpro html tags into analysisfeatureprop table
-				//------------------------------------------------------------
-				// Before inserting, make sure it's not a duplicate
-				$sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
-				$result = db_query($sql, $analysisfeature_id, $type_id);
-				$duplicate = 0;
-				while ($afp_value = db_fetch_object($result)) {
-				   if ($table->asXML() == $afp_value->value) {
-				      $duplicate = 1;
-				   }
-				}
-				if (!$duplicate) {
-				   $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
-                      "VALUES (%d, %d, '%s', %d)";
-				   db_query($sql, $analysisfeature_id, $type_id, $table->asXML(), $hi_rank);
-				   fwrite($log, " (Insert)\n"); // write to log
-				} else {
-				   fwrite($log, " (Skipped)\n");
-				}
-				
-			   // Parse GO terms. Make sure GO database schema is installed in chado
-            $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
-            if (!$go_db_id) {
-               print 'GO schema not installed in chado. GO terms are not processed.';
+
+            // Set job status
+            // Get the first row and match its name with the feature name
+            $firsttd = $table->children()->children()->children();
+            $feature_id = 0;
+            foreach($firsttd as $b) {
+                foreach($b->children() as $a) {
+                    if ($a->getName() == 'a') {
+                        // Remove _ORF from the sequence name
+                        $seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a);
+                        print "seqname is $seqname\n";
+
+                        // Find out how many features match this uniquename
+                        $sql = "SELECT count(feature_id) FROM {feature} ".
+                        "WHERE uniquename = '%s' ";
+                        $no_features = db_result(db_query($sql, $seqname));
+
+                        // If there is only one match, get the feature_id
+                        if ($no_features == 1) {
+                            $sql = "SELECT feature_id FROM {feature} ".
+                                    "WHERE uniquename = '%s' ";
+                            $feature_id = db_result(db_query($sql, $seqname));
+                            print "\tfeature id is $feature_id\n";
+
+                            // If the uniquename matches more than one features then skip and print 'Ambiguous'
+                        } else if ($no_features > 1) {
+                            fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
+                            continue;
+
+                        // If the uniquename did not match, skip and print 'Failed'
+                        } else {
+                            fwrite($log, "Failed: ".$seqname."\n");
+                        }
+
+                    }
+                }
             }
-            if ($go_db_id && $parsego) {
-               $trs = $table->children();
-               foreach ($trs as $tr) {
-                  $tds = $tr->children();
-                  foreach($tds as $td) {
-                     $gotags = $td->children();
-                     foreach ($gotags as $gotag) {
-                          // Look for 'GO:accession#'
-                        if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) {
-                        	
-                           // Find cvterm_id for the matched GO term
-                           $sql = "SELECT cvterm_id FROM {cvterm} CVT 
-                                   INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
-                                   WHERE DBX.accession = '%s' AND DBX.db_id = %d";
-                           $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
-                           
-                           //-------------------------------------------
-                           // Insert GO terms into feature_cvterm table
-                           //-------------------------------------------
-                           // Default pub_id = 1 (NULL) was used
-                           $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
-                                   VALUES (%d, %d, 1)";
-                           db_query($sql, $feature_id, $goterm_id);
-                                                     
-                           //------------------------------------------------
-                           // Insert GO terms into analysisfeatureprop table
-                           //------------------------------------------------
-                           $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
-                                  "VALUES (%d, %d, '%s', 0)";
-                           db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
-                           
+
+            // Successfully matched. print 'Succeeded'. Add analysis_id and
+            // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
+            if ($feature_id) {
+
+                //------------------------------------
+                // Cleanse unwanted rows from the table
+                //------------------------------------
+
+                $parent_row =   "/<tr><td valign=\"top\"><b>Parent<\/b><\/td>\s*<td valign=\"top\">\s*no.*?parent<\/td>\s*<\/tr>/";
+                $children_row = "/<tr><td valign=\"top\"><b>Children<\/b><\/td>\s*<td valign=\"top\">\s*no.*?children<\/td>\s*<\/tr>/";
+                $found_row    = "/<tr><td valign=\"top\"><b>Found.*?in<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/";
+                $contains_row = "/<tr><td valign=\"top\"><b>Contains<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/";
+                $go_row       = "/<tr><td valign=\"top\"><b>GO.*?terms<\/b><\/td>\s*<td valign=\"top\">\s*none<\/td>\s*<\/tr>/";
+
+                $table_txt = $table->asXML();
+                $table_txt = preg_replace($parent_row, "", $table_txt);
+                $table_txt = preg_replace($children_row, "", $table_txt);
+                $table_txt = preg_replace($found_row, "", $table_txt);
+                $table_txt = preg_replace($contains_row, "", $table_txt);
+                $table_txt = preg_replace($go_row, "", $table_txt);
+
+                //print "----------------------------\n";
+                //print "old: ".$table->asXML()."\n\n\n";
+                //print "----------------------------\n";
+                //print "Fixed: $table_txt\n";
+                //print "----------------------------\n";
+
+                //------------------------------------
+                // Insert into analysisfeature table
+                //------------------------------------
+                $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
+                        "VALUES (%d, %d)";
+                db_query ($sql, $feature_id, $analysis_id);
+
+                // If a matched feature is found, write to log.
+                fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
+
+                // Get the highest rank for this feature_id in the analysisfeatureprop table.
+                // If the value of the inserting content is not a duplicate, add it to
+                // analysisfeatureprop with 'highest_rank + 1'
+                $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
+                        "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".
+                        "WHERE feature_id=%d ".
+                        "AND analysis_id=%d ".
+                        "AND type_id=%d ";
+
+                $afp =  db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
+                $hi_rank = 0;
+                if ($afp) {
+                    $hi_rank = $afp->max + 1;
+                }
+                // Get the newly inserted analysisfeature_id
+                $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
+                $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
+                //------------------------------------------------------------
+                // Insert interpro html tags into analysisfeatureprop table
+                //------------------------------------------------------------
+                // Before inserting, make sure it's not a duplicate
+                $sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
+                $result = db_query($sql, $analysisfeature_id, $type_id);
+                $duplicate = 0;
+                while ($afp_value = db_fetch_object($result)) {
+                    if ($table_txt == $afp_value->value) {
+                        $duplicate = 1;
+                    }
+                }
+                if (!$duplicate) {
+                    $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
+                            "VALUES (%d, %d, '%s', %d)";
+                    db_query($sql, $analysisfeature_id, $type_id, $table_txt, $hi_rank);
+                    fwrite($log, " (Insert)\n"); // write to log
+                    print "\twriting table\n";
+                } else {
+                   fwrite($log, " (Skipped)\n");
+                    print "\tskipping table - dup\n";
+                }
+
+                // Parse GO terms. Make sure GO database schema is installed in chado
+                $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
+                if (!$go_db_id) {
+                    print 'GO schema not installed in chado. GO terms are not processed.';
+                }
+                if ($go_db_id && $parsego) {
+                    $trs = $table->children();
+                    foreach ($trs as $tr) {
+                        $tds = $tr->children();
+                        foreach($tds as $td) {
+                            $gotags = $td->children();
+                            foreach ($gotags as $gotag) {
+                                // Look for 'GO:accession#'
+                                if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) {
+
+                                    // Find cvterm_id for the matched GO term
+                                    $sql = "SELECT cvterm_id FROM {cvterm} CVT 
+                                            INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
+                                            WHERE DBX.accession = '%s' AND DBX.db_id = %d";
+                                    $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
+
+                                    //-------------------------------------------
+                                    // Insert GO terms into feature_cvterm table
+                                    //-------------------------------------------
+                                    // Default pub_id = 1 (NULL) was used
+                                    $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
+                                            VALUES (%d, %d, 1)";
+                                    db_query($sql, $feature_id, $goterm_id);
+
+                                    //------------------------------------------------
+                                    // Insert GO terms into analysisfeatureprop table
+                                    //------------------------------------------------
+                                    $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
+                                            "VALUES (%d, %d, '%s', 0)";
+                                    db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
+                                }
+                            }
                         }
-                     }
-                  }
-               }
+                    }
+                }
             }
-				
-			}
-		}
-	}
-	db_set_active ($previous_db); // Use drupal database
-	print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
-	
-	fwrite($log, "\n");
-	fclose($log);
-   return;
+        }
+    }
+    db_set_active ($previous_db); // Use drupal database
+    print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
+
+    fwrite($log, "\n");
+    fclose($log);
+    return;
 }
 
+
 /*******************************************************************************
  * tripal_analysis_interpro_nodeapi()
  * HOOK: Implementation of hook_nodeapi()