Browse Source

Add functions to extract keywords for the interpro html output

ccheng 14 years ago
parent
commit
35eb57cf9f

+ 1 - 0
tripal_analysis_interpro/tripal_analysis_interpro.install

@@ -54,6 +54,7 @@ function tripal_analysis_interpro_add_cvterms(){
    tripal_add_cvterms('analysis_interpro_query_re','The regular expression for finding the feature name in the query definition line of the InterPro results');
    tripal_add_cvterms('analysis_interpro_query_type','The feature type (e.g. mRNA, polypeptide) of the query input file. This type is used to identify the query feature when multiple features have the same name');
    tripal_add_cvterms('analysis_interpro_query_uniquename','Indicates if the matched name in the query definition line of the blast results is feature uniquename');
+   tripal_add_cvterms('analysis_interpro_output_keywords', 'Selected keywords for interpro html output which are indexed for search.');
 }
 /**
  * 

+ 79 - 0
tripal_analysis_interpro/tripal_analysis_interpro.module

@@ -114,6 +114,11 @@ function chado_analysis_interpro_form ($node){
       '#description' => t('Check the box to load GO terms to chado database'),
       '#default_value' => $parsego
    );
+   $form['interpro']['interprokeywordjob'] = array(
+      '#type' => 'checkbox',
+      '#title' => t('Submit a job to extract keywords from the Interpro html output'),
+      '#description' => t('Note: Interpro results are only searchable after keywords are extracted. Do not run this twice if you have already done so.'),
+	);
 	$form['interpro']['interproparameters'] = array(
       '#title' => t('Parameters'),
       '#type' => 'textfield',
@@ -281,6 +286,14 @@ function chado_analysis_interpro_submit_job($node){
 			drupal_set_message("Can not open interpro output file. Job not scheduled.");
 		}
 	}
+	
+	// Add a job if the user wants to extract the keywords from the HTML output
+	if ($node->interprokeywordjob) {
+		$analysis_id = chado_get_id_for_node('analysis', $node);
+		$job_args[0] = $analysis_id;
+		tripal_add_job("Extract keywords for search: $node->analysisname",'tripal_analysis_interpro',
+                           'tripal_analysis_interpro_extract_keywords', $job_args, $user->uid);
+	}
 }
 
 
@@ -611,3 +624,69 @@ function chado_analysis_interpro_access($op, $node, $account){
    return FALSE;
 }
 
+/*******************************************************************************
+ * Extract searchable keywords from the Interpro HTML results that are stored
+ * in analysisfeatureprop for one analysis.
+ *
+ * For every 'analysis_interpro_output_hit' property of the analysis (skipping
+ * values that contain 'No hits reported.', 'parent' or 'children'), the
+ * SEQUENCE:/CRC64:/LENGTH: text up to the end of its line and all HTML tags
+ * are stripped. The remaining text is stored as an
+ * 'analysis_interpro_output_keywords' property of the same analysisfeature.
+ * If such a property already exists the new keywords are appended to it, so
+ * running this more than once for an analysis duplicates the keywords.
+ *
+ * @param $analysis_id
+ *   The analysis_id matched against analysisfeature.analysis_id.
+ */
+function tripal_analysis_interpro_extract_keywords ($analysis_id) {
+
+	print "Extracting keywords...\n";
+	// Get all interpro output hits except for records with 'No hits reported', 'parent', 'children'.
+	$output_type_id = tripal_get_cvterm_id('analysis_interpro_output_hit');
+	$sql = "SELECT AFP.analysisfeature_id, AFP.value FROM {analysisfeatureprop} AFP 
+					 INNER JOIN {analysisfeature} AF ON AF.analysisfeature_id = AFP.analysisfeature_id
+					 WHERE type_id = $output_type_id 
+					 AND AF.analysis_id = $analysis_id
+					 AND value NOT like '%No hits reported.%' 
+           		 AND value NOT like '%parent%' 
+           		 AND value NOT like '%children%'";
+	$results = chado_query($sql);
+	$keyword_type_id = tripal_get_cvterm_id('analysis_interpro_output_keywords');
+
+	// Patterns to strip from the html output. preg_replace applies them in
+	// order, each one paired with the entry at the same index in $replace.
+	$search = array (
+							"'SEQUENCE:.*'",
+							"'CRC64:.*'",
+							"'LENGTH:.*'",
+							"'<b>InterPro<br/>'",
+							// NOTE(review): '|' makes this match "<br/>Domain" OR "Family\n".
+							// If "<br/>(Domain|Family)\n" was intended the alternation
+							// needs grouping -- confirm against real output.
+							"'<br/>Domain|Family\n'",
+							"'<td>no description</td>'",
+                 		"'<[/!]*?[^<>]*?>'si",          // replace HTML tags with a space
+							"'\n'", // replace newlines with a space
+	);
+	$replace = array (
+								"",
+								"",
+								"",
+								"",
+								"", 
+								"",
+					  			" ",
+                 			" ",
+	);
+	while ($record = db_fetch_object($results)) {
+		$af_id = $record->analysisfeature_id;
+		$value = $record->value;
+
+		// Retrieve keywords for this analysisfeature_id if there are any
+		$sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id =$af_id AND type_id = $keyword_type_id";
+		$keywords = db_result(chado_query($sql));
+
+		// Extract new keywords from the interpro html output
+		$text = preg_replace($search, $replace, $value);
+		// Collapse runs of spaces. preg_replace is used here because ereg_replace
+		// is deprecated as of PHP 5.3 and removed in PHP 7.
+		$new_keywords = trim(preg_replace('/ +/', ' ', $text));
+
+		// The keyword text comes from free-form html output and may contain
+		// quotes, so it is escaped before being placed in the SQL string literal.
+		if ($keywords) {
+			// Append the new keywords to the existing ones
+			$new_keywords = "$keywords $new_keywords";
+			$escaped_keywords = db_escape_string($new_keywords);
+			$sql = "UPDATE {analysisfeatureprop} SET value = '$escaped_keywords' WHERE analysisfeature_id =$af_id AND type_id = $keyword_type_id ";
+		} else {
+			// Insert the keyword into the analysisfeatureprop table
+			$escaped_keywords = db_escape_string($new_keywords);
+			$sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) VALUES ($af_id, $keyword_type_id, '$escaped_keywords', 0)";
+		}
+		chado_query($sql);
+
+	}
+
+	print "Finished.\n";
+
+}