12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004 |
- <?php
/**
 * Form builder: collects the settings for a FASTA file import job.
 *
 * The form asks for the file location, the organism, the Sequence Ontology
 * type of the sequences, the insert/update method, the analysis the features
 * belong to and, in the advanced section, regular expressions used to pull
 * the name, unique name, database accession and parent feature out of each
 * FASTA definition line.
 *
 * The select lists for organisms, analyses and external databases are filled
 * from the Chado {organism}, {analysis} and {db} tables.
 *
 * @return array
 *   A Drupal Form API array.
 */
function tripal_feature_fasta_load_form() {
  $form = array();

  $form['fasta_file'] = array(
    '#type' => 'textfield',
    '#title' => t('FASTA File'),
    '#description' => t('Please enter the full system path for the FASTA file, or a path within the Drupal
      installation (e.g. /sites/default/files/xyz.fasta). The path must be accessible to the
      server on which this Drupal instance is running.'),
    '#required' => TRUE,
  );

  // Organism select list; the leading empty entry forces an explicit choice.
  $organisms = array('' => '');
  $org_rset = chado_query("SELECT * FROM {organism} ORDER BY genus, species");
  while ($organism = $org_rset->fetchObject()) {
    $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  }
  $form['organism_id'] = array(
    '#title' => t('Organism'),
    // The element type is a machine name and must never be translated.
    '#type' => 'select',
    '#description' => t("Choose the organism to which these sequences are associated"),
    '#required' => TRUE,
    '#options' => $organisms,
  );

  // The autocomplete callback needs the id of the 'sequence' vocabulary.
  // Guard against the vocabulary not being loaded so the form still renders.
  $cv = chado_select_record('cv', array('cv_id'), array('name' => 'sequence'));
  $cv_id = (is_array($cv) and count($cv) > 0) ? $cv[0]->cv_id : '';
  $form['seqtype'] = array(
    '#type' => 'textfield',
    '#title' => t('Sequence Type'),
    '#required' => TRUE,
    '#description' => t('Please enter the Sequence Ontology (SO) term name that describes the sequences in the FASTA file (e.g. gene, mRNA, polypeptide, etc...)'),
    '#autocomplete_path' => "admin/tripal/storage/chado/auto_name/cvterm/$cv_id",
  );

  // The numeric option keys (0, 1, 2) are mapped back to their labels by the
  // validate and submit handlers, so the order here must not change.
  $form['method'] = array(
    '#type' => 'radios',
    '#title' => t('Method'),
    '#required' => TRUE,
    '#options' => array(
      t('Insert only'),
      t('Update only'),
      t('Insert and update'),
    ),
    '#description' => t('Select how features in the FASTA file are handled.
      Select "Insert only" to insert the new features. If a feature already
      exists with the same name or unique name and type then it is skipped.
      Select "Update only" to only update features that already exist in the
      database. Select "Insert and Update" to insert features that do
      not exist and update those that do.'),
    '#default_value' => 2,
  );

  // Option keys: 0 = match on name, 1 = match on unique name.
  $form['match_type'] = array(
    '#type' => 'radios',
    '#title' => t('Name Match Type'),
    '#required' => TRUE,
    '#options' => array(
      t('Name'),
      t('Unique name'),
    ),
    '#description' => t('Used for "updates only" or "insert and update" methods. Not required if method type is "insert".
      Feature data is stored in Chado with both a human-readable
      name and a unique name. If the features in your FASTA file are uniquely identified using
      a human-readable name then select the "Name" button. If your features are
      uniquely identified using the unique name then select the "Unique name" button. If you
      loaded your features first using the GFF loader then the unique name of each
      features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
      By default, the FASTA loader will use the first word (character string
      before the first space) as the name for your feature. If
      this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
      Additionally, you may import both a name and a unique name for each sequence using the advanced options.'),
    '#default_value' => 1,
  );

  $form['analysis'] = array(
    '#type' => 'fieldset',
    '#title' => t('Analysis Used to Derive Features'),
    '#collapsed' => TRUE,
  );
  $form['analysis']['desc'] = array(
    '#markup' => t("Why specify an analysis for a data load? All data comes
      from some place, even if downloaded from Genbank. By specifying
      analysis details for all data uploads, it allows an end user to reproduce the
      data set, but at least indicates the source of the data."),
  );

  $analyses = array('' => '');
  $analysis_rset = chado_query("SELECT * FROM {analysis} ORDER BY name");
  while ($analysis = $analysis_rset->fetchObject()) {
    $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
  }
  $form['analysis']['analysis_id'] = array(
    '#title' => t('Analysis'),
    '#type' => 'select',
    '#description' => t("Choose the analysis to which these features are associated"),
    '#required' => TRUE,
    '#options' => $analyses,
  );

  $form['advanced'] = array(
    '#type' => 'fieldset',
    '#title' => t('Advanced Options'),
    '#collapsible' => TRUE,
    '#collapsed' => TRUE,
  );
  // Drupal 7 renders the body of an 'item' element from '#markup'; the
  // Drupal 6 '#value' key is ignored and the help text would not be shown.
  $form['advanced']['re_help'] = array(
    '#type' => 'item',
    '#markup' => t('A regular expression is an advanced method for extracting information from a string of text.
      Your FASTA file may contain both a human-readable name and a unique name for each sequence.
      If you want to import
      both the name and unique name for all sequences, then you must provide regular expressions
      so that the loader knows how to separate them.
      Otherwise the name and uniquename will be the same.
      By default, this loader will use the first word in the definition
      lines of the FASTA file
      as the name or unique name of the feature.'),
  );
  $form['advanced']['re_name'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the
      feature name from the FASTA definition line. For example, for a
      definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
      the regular expression for the name would be, "^(.*?)\|.*$". All FASTA
      definition lines begin with the ">" symbol. You do not need to include
      this symbol in your regular expression.'),
  );
  $form['advanced']['re_uname'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the unique name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the
      feature unique name from the FASTA definition line. For example, for a
      definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
      the regular expression for the unique name would be "^.*?\|(.*)$". All FASTA
      definition lines begin with the ">" symbol. You do not need to include
      this symbol in your regular expression.'),
  );

  $form['advanced']['db'] = array(
    '#type' => 'fieldset',
    '#title' => t('External Database Reference'),
    '#weight' => 6,
    '#collapsed' => TRUE,
  );
  $form['advanced']['db']['re_accession'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the accession'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
    '#weight' => 2,
  );

  $dbs = array('' => '');
  $db_rset = chado_query("SELECT * FROM {db} ORDER BY name");
  while ($db = $db_rset->fetchObject()) {
    $dbs[$db->db_id] = "$db->name";
  }
  $form['advanced']['db']['db_id'] = array(
    '#title' => t('External Database'),
    '#type' => 'select',
    '#description' => t("Please choose an external database for which these sequences have a cross reference."),
    '#required' => FALSE,
    '#options' => $dbs,
    '#weight' => 1,
  );

  $form['advanced']['relationship'] = array(
    '#type' => 'fieldset',
    '#title' => t('Relationships'),
    '#weight' => 6,
    '#collapsed' => TRUE,
  );
  // The keys are the relationship term names looked up by the loader; only
  // the labels are translated.
  $rels = array(
    '' => '',
    'part_of' => t('part of'),
    'derives_from' => t('produced by (derives from)'),
  );
  $form['advanced']['relationship']['rel_type'] = array(
    '#title' => t('Relationship Type'),
    '#type' => 'select',
    '#description' => t("Use this option to create associations, or relationships between the
      features of this FASTA file and existing features in the database. For
      example, to associate a FASTA file of peptides to existing genes or transcript sequence,
      select the type 'produced by'. For a CDS sequences select the type 'part of'"),
    '#required' => FALSE,
    '#options' => $rels,
    '#weight' => 5,
  );
  $form['advanced']['relationship']['re_subject'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the parent'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the unique
      name needed to identify the existing sequence for which the
      relationship type selected above will apply.'),
    '#weight' => 6,
  );
  $form['advanced']['relationship']['parent_type'] = array(
    '#type' => 'textfield',
    '#title' => t('Parent Type'),
    '#required' => FALSE,
    '#description' => t('Please enter the Sequence Ontology term for the parent. For example
      if the FASTA file being loaded is a set of proteins that are
      products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
      this type must match the type for already loaded features.'),
    '#weight' => 7,
  );

  $form['button'] = array(
    '#type' => 'submit',
    '#value' => t('Import FASTA file'),
    '#weight' => 10,
  );

  return $form;
}
/**
 * Form validation handler for tripal_feature_fasta_load_form().
 *
 * Checks that:
 * - the regular expression needed for the selected match type is present
 *   when only the other one was supplied;
 * - the FASTA file can be found, either relative to the Drupal installation
 *   or as an absolute path;
 * - the relationship fields (type, parent regular expression, parent SO
 *   term) are either all filled in or all empty;
 * - the external database and accession regular expression are supplied
 *   together;
 * - the sequence type (and the parent type when a relationship is requested)
 *   is a term or synonym in the 'sequence' vocabulary.
 *
 * On completion the resolved file path is stored in
 * $form_state['storage']['dfile'] for the submit handler.
 *
 * @param array $form
 *   The form structure.
 * @param array $form_state
 *   The current form state, passed by reference.
 */
function tripal_feature_fasta_load_form_validate($form, &$form_state) {
  $values = $form_state['values'];

  $fasta_file = trim($values['fasta_file']);
  $type = trim($values['seqtype']);
  $match_type = trim($values['match_type']);
  $re_name = trim($values['re_name']);
  $re_uname = trim($values['re_uname']);
  $re_accession = trim($values['re_accession']);
  $db_id = $values['db_id'];
  $rel_type = $values['rel_type'];
  $re_subject = trim($values['re_subject']);
  $parent_type = trim($values['parent_type']);

  // The radios submit their numeric option key; translate it to the label
  // used throughout the loader. Unknown values are left untouched.
  $match_labels = array('0' => 'Name', '1' => 'Unique name');
  if (isset($match_labels[$match_type])) {
    $match_type = $match_labels[$match_type];
  }

  // When only one of the two regular expressions is given it must be the one
  // for the field used for matching.
  if ($re_name and !$re_uname and strcmp($match_type, 'Unique name') == 0) {
    form_set_error('re_uname', t("You must provide a regular expression to identify the sequence unique name"));
  }
  if (!$re_name and $re_uname and strcmp($match_type, 'Name') == 0) {
    form_set_error('re_name', t("You must provide a regular expression to identify the sequence name"));
  }

  // First try the path relative to the Drupal installation, then fall back
  // to treating it as a full system path.
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
  if (!file_exists($dfile)) {
    $dfile = $fasta_file;
  }
  if (!file_exists($dfile)) {
    form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  }

  // The three relationship fields only make sense together.
  if (($rel_type or $parent_type) and !$re_subject) {
    form_set_error('re_subject', t("Please provide a regular expression for the parent"));
  }
  if (($rel_type or $re_subject) and !$parent_type) {
    form_set_error('parent_type', t("Please provide a SO term for the parent"));
  }
  if (($parent_type or $re_subject) and !$rel_type) {
    form_set_error('rel_type', t("Please select a relationship type"));
  }

  // The external database and its accession expression go together too.
  if ($db_id and !$re_accession) {
    form_set_error('re_accession', t("Please provide a regular expression for the accession"));
  }
  if ($re_accession and !$db_id) {
    form_set_error('db_id', t("Please select a database"));
  }

  // A term is accepted if either its name or one of its synonyms matches.
  $cvtermsql = "SELECT CVT.cvterm_id
    FROM {cvterm} CVT
      INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
      LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
    WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)";

  $cvterm = chado_query($cvtermsql, array(
    ':cvname' => 'sequence',
    ':name' => $type,
    ':synonym' => $type,
  ))->fetchObject();
  if (!$cvterm) {
    // The form element is named 'seqtype'; flagging 'type' would attach the
    // error to no field at all.
    form_set_error('seqtype', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
  }

  // An empty parent type has already been reported above, so only look the
  // term up when there is something to look up.
  if ($rel_type and $parent_type) {
    $parent_cvterm = chado_query($cvtermsql, array(
      ':cvname' => 'sequence',
      ':name' => $parent_type,
      ':synonym' => $parent_type,
    ))->fetchObject();
    if (!$parent_cvterm) {
      form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
    }
  }

  // Hand the resolved path to the submit handler.
  $form_state['storage']['dfile'] = $dfile;
}
/**
 * Form submission handler for tripal_feature_fasta_load_form().
 *
 * Gathers the submitted settings, converts the numeric radio values for the
 * method and match type into their labels, and queues a Tripal job that runs
 * tripal_feature_load_fasta() with those settings.
 *
 * @param array $form
 *   The form structure.
 * @param array $form_state
 *   The current form state; the file path is read from
 *   $form_state['storage']['dfile'].
 */
function tripal_feature_fasta_load_form_submit($form, &$form_state) {
  global $user;

  $input = $form_state['values'];

  $dfile = $form_state['storage']['dfile'];
  $organism_id = $input['organism_id'];
  $analysis_id = $input['analysis_id'];
  $db_id = $input['db_id'];
  $rel_type = $input['rel_type'];

  $type = trim($input['seqtype']);
  $method = trim($input['method']);
  $match_type = trim($input['match_type']);
  $re_name = trim($input['re_name']);
  $re_uname = trim($input['re_uname']);
  $re_accession = trim($input['re_accession']);
  $re_subject = trim($input['re_subject']);
  $parent_type = trim($input['parent_type']);

  // Replace the radio option key with the label the loader expects.
  if ($method == 0) {
    $method = 'Insert only';
  }
  elseif ($method == 1) {
    $method = 'Update only';
  }
  elseif ($method == 2) {
    $method = 'Insert and update';
  }

  if ($match_type == 0) {
    $match_type = 'Name';
  }
  elseif ($match_type == 1) {
    $match_type = 'Unique name';
  }

  // Argument order must follow the signature of tripal_feature_load_fasta().
  $args = array(
    $dfile,
    $organism_id,
    $type,
    $re_name,
    $re_uname,
    $re_accession,
    $db_id,
    $rel_type,
    $re_subject,
    $parent_type,
    $method,
    $user->uid,
    $analysis_id,
    $match_type,
  );

  // Only the part after the last slash is shown in the job name.
  $fname = preg_replace("/.*\/(.*)/", "$1", $dfile);

  // The value returned by module_load_include() is handed to the job as the
  // file it needs to include.
  $loader_file = module_load_include('inc', 'tripal_chado', 'includes/loaders/tripal_chado.fasta_loader');
  $includes = array($loader_file);

  tripal_add_job("Import FASTA file: $fname", 'tripal_chado', 'tripal_feature_load_fasta', $args, $user->uid, 10, $includes);
}
/**
 * Imports the sequences of a FASTA file into Chado as features.
 *
 * The import runs in two passes inside one database transaction:
 * - Step 1 scans the file, and for every definition line (a line starting
 *   with ">") records the name, unique name, accession, parent identifier
 *   and the byte offsets where the residues start and end.
 * - Step 2 calls tripal_feature_load_fasta_feature() for every recorded
 *   sequence, which creates or updates the feature and streams the residues
 *   from the still-open file handle.
 *
 * If an exception is thrown the transaction is rolled back.
 *
 * @param string $dfile
 *   Path of the FASTA file to read.
 * @param int $organism_id
 *   The organism_id the features belong to.
 * @param string $type
 *   Name (or synonym) of the 'sequence' vocabulary term for the features.
 * @param string $re_name
 *   Optional regular expression (without delimiters) whose first capture
 *   group is the feature name. It is embedded between "/" delimiters, so a
 *   literal slash must be escaped by the caller.
 * @param string $re_uname
 *   Optional regular expression whose first capture group is the unique name.
 * @param string $re_accession
 *   Optional regular expression whose first capture group is the accession.
 * @param int $db_id
 *   Optional db_id of the external database for the accession.
 * @param string $rel_type
 *   Optional name of the relationship term linking feature and parent.
 * @param string $re_subject
 *   Optional regular expression whose first capture group is the unique name
 *   of the parent feature. Unlike the other expressions it is applied to the
 *   full definition line including the leading ">".
 * @param string $parent_type
 *   Optional name of the 'sequence' vocabulary term of the parent feature.
 * @param string $method
 *   One of 'Insert only', 'Update only' or 'Insert and update'.
 * @param int $uid
 *   The id of the user who submitted the job. Not used by this function.
 * @param int $analysis_id
 *   Optional analysis_id to associate the features with.
 * @param string $match_type
 *   Either 'Name' or 'Unique name'.
 * @param int $job
 *   Optional job id; when given, progress is reported for it.
 *
 * @return int|null
 *   0 when a required term cannot be found or the file cannot be opened,
 *   otherwise nothing.
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession,
  $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
  $job = NULL) {

  $transaction = db_transaction();
  print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
    "If the load fails or is terminated prematurely then the entire set of \n" .
    "insertions/updates is rolled back and will not be found in the database\n\n";

  // Declared before the try block so the catch block can tell whether the
  // file was ever opened.
  $fh = NULL;

  try {
    // A term is accepted if either its name or one of its synonyms matches.
    $cvtermsql = "
      SELECT CVT.cvterm_id
      FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
    ";

    $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $type, ':synonym' => $type))->fetchObject();
    if (!$cvterm) {
      tripal_report_error("T_fasta_loader", TRIPAL_ERROR,
        "Cannot find the term type: '%type'", array('%type' => $type));
      return 0;
    }

    $parentcvterm = NULL;
    if ($parent_type) {
      $parentcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
      if (!$parentcvterm) {
        // Report the requested term name, not the failed lookup result.
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR,
          "Cannot find the parent term type: '%type'", array('%type' => $parent_type));
        return 0;
      }
    }

    $relcvterm = NULL;
    if ($rel_type) {
      $relcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
      if (!$relcvterm) {
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR,
          "Cannot find the relationship term type: '%type'", array('%type' => $rel_type));
        return 0;
      }
    }

    // The table definitions provide the maximum lengths of the name and
    // accession columns.
    $feature_tbl = chado_get_schema('feature');
    $dbxref_tbl = chado_get_schema('dbxref');
    $max_name_len = $feature_tbl['fields']['name']['length'];
    $max_accession_len = $dbxref_tbl['fields']['accession']['length'];

    print "Step 1: finding sequences\n";
    $filesize = filesize($dfile);
    $fh = fopen($dfile, 'r');
    if (!$fh) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "cannot open file: %dfile", array(
        '%dfile' => $dfile,
      ));
      return 0;
    }

    // Progress is reported roughly every 1% of the file.
    $interval = max(1, intval($filesize * 0.01));
    $intv_read = 0;
    $num_read = 0;
    $line_num = 0;

    $seqs = array();
    $num_seqs = 0;
    // Byte offset just after the previously read line.
    $prev_pos = 0;

    // Compare against FALSE explicitly: a final line consisting of "0"
    // without a newline is falsy and would otherwise end the loop early.
    while (($line = fgets($fh)) !== FALSE) {
      $line_num++;
      $line_len = strlen($line);
      $num_read += $line_len;
      $intv_read += $line_len;

      if (preg_match('/^>/', $line)) {
        $defline = preg_replace("/^>/", '', $line);

        // Start every record clean so that a failed match never inherits
        // the values of the previous sequence.
        $name = '';
        $uname = '';
        $accession = '';
        $subject = '';
        $matches = array();

        // Feature name: the user's expression, or by default the first word
        // of the definition line when matching on the name.
        if ($re_name) {
          if (!preg_match("/$re_name/", $defline, $matches) or !isset($matches[1])) {
            tripal_report_error('trp-fasta', TRIPAL_ERROR,
              "ERROR: Regular expression for the feature name finds nothing. Line %line.",
              array('%line' => $line_num));
          }
          elseif (strlen($matches[1]) > $max_name_len) {
            tripal_report_error('trp-fasta', TRIPAL_WARNING,
              "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.",
              array('%line' => $line_num));
          }
          else {
            $name = trim($matches[1]);
          }
        }
        elseif (strcmp($match_type, 'Name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
            if (strlen($matches[1]) > $max_name_len) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING,
                "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.",
                array('%line' => $line_num));
            }
            else {
              $name = trim($matches[1]);
            }
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR,
              "ERROR: Cannot find a feature name. Line %line.",
              array('%line' => $line_num));
          }
        }

        // Unique name: the user's expression, or by default the first word
        // of the definition line when matching on the unique name.
        if ($re_uname) {
          if (preg_match("/$re_uname/", $defline, $matches) and isset($matches[1])) {
            $uname = trim($matches[1]);
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR,
              "ERROR: Regular expression for the feature unique name finds nothing. Line %line.",
              array('%line' => $line_num));
          }
        }
        elseif (strcmp($match_type, 'Unique name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
            $uname = trim($matches[1]);
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR,
              "ERROR: Cannot find a feature unique name. Line %line.",
              array('%line' => $line_num));
          }
        }

        // Accession: only attempted when an expression was supplied; an
        // empty pattern would match everything without a capture group.
        if ($re_accession and preg_match("/$re_accession/", $defline, $matches) and isset($matches[1])) {
          if (strlen($matches[1]) > $max_accession_len) {
            tripal_report_error('trp-fasta', TRIPAL_WARNING,
              "WARNING: Regular expression retrieves an accession too long for the feature name. " .
              "Cannot add cross reference. Line %line.",
              array('%line' => $line_num));
          }
          else {
            $accession = trim($matches[1]);
          }
        }

        // Parent identifier: applied to the whole line, ">" included.
        if ($re_subject and preg_match("/$re_subject/", $line, $matches) and isset($matches[1])) {
          $subject = trim($matches[1]);
        }

        // The residues start right after the definition line.
        $seqs[$num_seqs] = array(
          'name' => $name,
          'uname' => $uname,
          'accession' => $accession,
          'subject' => $subject,
          'seq_start' => ftell($fh),
        );

        // The previous sequence ends where this definition line began.
        if ($num_seqs > 0) {
          $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;
        }
        $num_seqs++;
      }

      $prev_pos = ftell($fh);

      if ($job and $intv_read >= $interval) {
        $intv_read = 0;
        $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
        print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
          " bytes.\r";
        tripal_set_job_progress($job, intval(($num_read / $filesize) * 100));
      }
    }

    // An empty file has nothing left to parse; avoid dividing by zero.
    $percent = sprintf("%.2f", $filesize > 0 ? ($num_read / $filesize) * 100 : 100);
    print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
      " bytes.\r";
    if ($job) {
      tripal_set_job_progress($job, 50);
    }

    // The last sequence runs to the end of the file. Skip this when no
    // definition line was found at all.
    if ($num_seqs > 0) {
      $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;
    }

    // Passed through to the feature loader, which accepts but does not use
    // it.
    $source = NULL;

    print "\nStep 2: Importing sequences\n";
    for ($i = 0; $i < $num_seqs; $i++) {
      $seq = $seqs[$i];
      print "Importing " . ($i + 1) . " of $num_seqs. ";
      if ($seq['name']) {
        print "Current feature: " . $seq['name'] . ".\n";
      }
      else {
        print "Current feature: " . $seq['uname'] . ".\n";
      }
      tripal_feature_load_fasta_feature($fh, $seq['name'], $seq['uname'], $db_id,
        $seq['accession'], $seq['subject'], $rel_type, $parent_type, $analysis_id,
        $organism_id, $cvterm, $source, $method, $re_name, $match_type, $parentcvterm,
        $relcvterm, $seq['seq_start'], $seq['seq_end']);
    }
    if ($job) {
      tripal_set_job_progress($job, 100);
    }
    fclose($fh);
  }
  catch (Exception $e) {
    // The exception may have been thrown before the file was opened.
    if ($fh) {
      fclose($fh);
    }
    $transaction->rollback();
    print "\n";
    watchdog_exception('T_fasta_loader', $e);
    print "FAILED: Rolling back database changes...\n";
  }
  print "\nDone\n";
}
/**
 * Inserts or updates a single feature from a FASTA record.
 *
 * Looks for an existing feature by name or unique name (per $match_type),
 * inserts it or updates its name/unique name according to $method, loads
 * the residues from the file, and then optionally links the feature to an
 * analysis, an external database accession and a parent feature.
 *
 * @param resource $fh
 *   Open handle of the FASTA file.
 * @param string $name
 *   The feature name parsed from the definition line (may be empty).
 * @param string $uname
 *   The feature unique name parsed from the definition line (may be empty).
 * @param int $db_id
 *   Optional db_id for the cross reference.
 * @param string $accession
 *   Optional accession for the cross reference.
 * @param string $parent
 *   Optional unique name of the parent feature.
 * @param string $rel_type
 *   Optional relationship type name; when set a relationship is created.
 * @param string $parent_type
 *   Name of the parent's type, used in messages.
 * @param int $analysis_id
 *   Optional analysis_id to associate with the feature.
 * @param int $organism_id
 *   The organism_id of the feature.
 * @param object $cvterm
 *   The type term of the feature; its cvterm_id is used.
 * @param mixed $source
 *   Not used.
 * @param string $method
 *   'Insert only', 'Update only' or 'Insert and update'.
 * @param string $re_name
 *   Not used.
 * @param string $match_type
 *   'Name' or 'Unique name'.
 * @param object $parentcvterm
 *   The type term of the parent feature (needed when $rel_type is set).
 * @param object $relcvterm
 *   The relationship term (needed when $rel_type is set).
 * @param int $seq_start
 *   Byte offset in the file where the residues start.
 * @param int $seq_end
 *   Byte offset in the file where the residues end.
 *
 * @return int|null
 *   0 when the record was skipped or a step failed, otherwise nothing.
 */
function tripal_feature_load_fasta_feature($fh, $name, $uname, $db_id, $accession, $parent,
  $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name,
  $match_type, $parentcvterm, $relcvterm, $seq_start, $seq_end) {

  $feature = NULL;
  // Only the cvterm_id is guaranteed to be present on the term object, so
  // fall back to it when the term name is not available for messages.
  $type = isset($cvterm->name) ? $cvterm->name : $cvterm->cvterm_id;

  $may_insert = (strcmp($method, 'Insert only') == 0 or strcmp($method, 'Insert and update') == 0);
  $may_update = (strcmp($method, 'Update only') == 0 or strcmp($method, 'Insert and update') == 0);

  // Look for an existing feature using the field selected for matching.
  if (strcmp($match_type, 'Name') == 0) {
    $values = array(
      'organism_id' => $organism_id,
      'name' => $name,
      'type_id' => $cvterm->cvterm_id,
    );
    $results = chado_select_record('feature', array('feature_id'), $values);
    if (count($results) > 1) {
      tripal_report_error('T_fasta_loader', TRIPAL_WARNING,
        "Multiple features exist with the name '%name' of type '%type' for the organism. skipping",
        array('%name' => $name, '%type' => $type));
      return 0;
    }
    if (count($results) == 1) {
      $feature = $results[0];
    }
  }

  if (strcmp($match_type, 'Unique name') == 0) {
    $values = array(
      'organism_id' => $organism_id,
      'uniquename' => $uname,
      'type_id' => $cvterm->cvterm_id,
    );
    $results = chado_select_record('feature', array('feature_id'), $values);
    if (count($results) > 1) {
      tripal_report_error('T_fasta_loader', TRIPAL_WARNING,
        "Multiple features exist with the name '%name' of type '%type' for the organism. skipping",
        array('%name' => $uname, '%type' => $type));
      return 0;
    }
    if (count($results) == 1) {
      $feature = $results[0];
    }
  }

  // "Insert only" must never touch an existing feature, whichever field was
  // used to find it; otherwise its residues would be overwritten below.
  if ($feature and strcmp($method, 'Insert only') == 0) {
    tripal_report_error('T_fasta_loader', TRIPAL_WARNING,
      "Feature already exists '%name' ('%uname') while matching on %type. Skipping insert.",
      array('%name' => $name, '%uname' => $uname, '%type' => drupal_strtolower($match_type)));
    return 0;
  }

  // Insert the feature when it does not exist yet.
  $inserted = FALSE;
  if (!$feature and $may_insert) {
    // When only one of the two names is known it is used for both.
    if (!$uname) {
      $uname = $name;
    }
    elseif (!$name) {
      $name = $uname;
    }

    $values = array(
      'organism_id' => $organism_id,
      'name' => $name,
      'uniquename' => $uname,
      'type_id' => $cvterm->cvterm_id,
    );
    $success = chado_insert_record('feature', $values);
    if (!$success) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
        "Failed to insert feature '%name (%uname)'",
        array('%name' => $name, '%uname' => $uname));
      return 0;
    }

    // Read the record back to obtain the new feature_id.
    $values = array(
      'organism_id' => $organism_id,
      'uniquename' => $uname,
      'type_id' => $cvterm->cvterm_id,
    );
    $results = chado_select_record('feature', array('feature_id'), $values);
    if (count($results) != 1) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
        "Failed to retrieve newly inserted feature '%name (%uname)'",
        array('%name' => $name, '%uname' => $uname));
      return 0;
    }
    $inserted = TRUE;
    $feature = $results[0];
  }

  // Without a feature there is nothing to update or attach data to.
  if (!$feature) {
    tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
      "Failed to find feature '%name' ('%uname') while matching on " . drupal_strtolower($match_type),
      array('%name' => $name, '%uname' => $uname));
    return 0;
  }

  // Update the other name field of a feature that already existed.
  if (!$inserted and $may_update) {
    if (strcmp($match_type, 'Name') == 0 and $uname) {
      // The new unique name must not belong to a different feature of the
      // same organism and type. Finding the feature itself is not a
      // conflict; that simply means the unique name is already in place.
      $values = array(
        'organism_id' => $organism_id,
        'uniquename' => $uname,
        'type_id' => $cvterm->cvterm_id,
      );
      $results = chado_select_record('feature', array('feature_id'), $values);
      foreach ($results as $existing) {
        if ($existing->feature_id != $feature->feature_id) {
          tripal_report_error('T_fasta_loader', TRIPAL_WARNING,
            "Cannot update the feature '%name' with a uniquename of '%uname' and type of '%type' as it
            conflicts with an existing feature with the same uniquename and type.",
            array('%name' => $name, '%uname' => $uname, '%type' => $type));
          return 0;
        }
      }

      $match = array(
        'name' => $name,
        'organism_id' => $organism_id,
        'type_id' => $cvterm->cvterm_id,
      );
      $success = chado_update_record('feature', $match, array('uniquename' => $uname));
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to update feature '%name' ('%uname')",
          array('%name' => $name, '%uname' => $uname));
        return 0;
      }
    }

    if (strcmp($match_type, 'Unique name') == 0 and $name) {
      $match = array(
        'uniquename' => $uname,
        'organism_id' => $organism_id,
        'type_id' => $cvterm->cvterm_id,
      );
      $success = chado_update_record('feature', $match, array('name' => $name));
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to update feature '%name' ('%uname')",
          array('%name' => $name, '%uname' => $uname));
        return 0;
      }
    }
  }

  // Load the residues once, for inserted and updated features alike. The
  // residue loader returns FALSE when a chunk could not be written.
  if (tripal_feature_load_fasta_residues($fh, $feature->feature_id, $seq_start, $seq_end) === FALSE) {
    tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
      "Failed to load the residues for feature '%name' ('%uname')",
      array('%name' => $name, '%uname' => $uname));
    return 0;
  }

  // Associate the feature with the analysis unless already associated.
  if ($analysis_id) {
    $values = array(
      'analysis_id' => $analysis_id,
      'feature_id' => $feature->feature_id,
    );
    $results = chado_select_record('analysisfeature', array('analysisfeature_id'), $values);
    if (count($results) == 0) {
      $success = chado_insert_record('analysisfeature', $values);
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to associate analysis and feature '%name' ('%uname')",
          array('%name' => $name, '%uname' => $uname));
        return 0;
      }
    }
  }

  // Add the external database cross reference. Records for which no
  // accession could be extracted are skipped instead of creating a
  // cross reference with an empty accession.
  if ($db_id and (string) $accession !== '') {
    $values = array(
      'db_id' => $db_id,
      'accession' => $accession,
    );
    $results = chado_select_record('dbxref', array('dbxref_id'), $values);
    if (count($results) == 0) {
      $success = chado_insert_record('dbxref', $values);
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to add database accession '%accession'",
          array('%accession' => $accession));
        return 0;
      }
      $results = chado_select_record('dbxref', array('dbxref_id'), $values);
      if (count($results) != 1) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to retrieve newly inserted dbxref '%name (%uname)'",
          array('%name' => $name, '%uname' => $uname));
        return 0;
      }
    }
    $dbxref = $results[0];

    $values = array(
      'feature_id' => $feature->feature_id,
      'dbxref_id' => $dbxref->dbxref_id,
    );
    $results = chado_select_record('feature_dbxref', array('feature_dbxref_id'), $values);
    if (count($results) == 0) {
      $success = chado_insert_record('feature_dbxref', $values);
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to add associate database accession '%accession' with feature",
          array('%accession' => $accession));
        return 0;
      }
    }
  }

  // Link the feature to its parent.
  if ($rel_type) {
    if (!$parentcvterm or !$relcvterm) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
        "Cannot create the relationship for feature '%name' ('%uname'): the parent type or relationship type term is missing.",
        array('%name' => $name, '%uname' => $uname));
      return 0;
    }

    $values = array(
      'organism_id' => $organism_id,
      'uniquename' => $parent,
      'type_id' => $parentcvterm->cvterm_id,
    );
    $results = chado_select_record('feature', array('feature_id'), $values);
    if (count($results) != 1) {
      tripal_report_error('T_fasta_loader', TRIPAL_WARNING,
        "Cannot find a unique feature for the parent '%parent' of type
        '%type' for the feature.",
        array('%parent' => $parent, '%type' => $parent_type));
      return 0;
    }
    $parent_feature = $results[0];

    $values = array(
      'subject_id' => $feature->feature_id,
      'object_id' => $parent_feature->feature_id,
      'type_id' => $relcvterm->cvterm_id,
    );
    $results = chado_select_record('feature_relationship', array('feature_relationship_id'), $values);
    if (count($results) == 0) {
      $success = chado_insert_record('feature_relationship', $values);
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
          "Failed to add the relationship between feature '%name' ('%uname') and parent '%parent'",
          array('%name' => $name, '%uname' => $uname, '%parent' => $parent));
        return 0;
      }
    }
  }
}
/**
 * Streams the residues of one FASTA record into the feature table.
 *
 * Reads the file from $seq_start up to $seq_end line by line, strips the
 * surrounding whitespace of every line and appends the text to the residues
 * column of the feature. The residues are written in chunks so that a long
 * sequence is never held in memory as a whole. Afterwards seqlen and
 * md5checksum are recomputed in the database from the stored residues.
 *
 * @param resource $fh
 *   Open handle of the FASTA file. Its position is changed.
 * @param int $feature_id
 *   The feature_id of the feature receiving the residues.
 * @param int $seq_start
 *   Byte offset of the first residue line.
 * @param int $seq_end
 *   Byte offset just past the last residue line.
 *
 * @return bool
 *   FALSE if appending a chunk failed, TRUE otherwise.
 */
function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_end) {
  fseek($fh, $seq_start, SEEK_SET);

  // Amount of file data read before the buffered residues are written.
  $chunk_size = 100000000;
  $chunk = '';

  // The "+ 1" keeps the length non-zero for the percentage calculations.
  $seqlen = ($seq_end - $seq_start) + 1;

  // Progress is printed about every 1% of the record, but never more often
  // than every 100,000 bytes.
  $interval = max(100000, intval($seqlen * 0.01));

  $chunk_intv_read = 0;
  $intv_read = 0;
  $num_read = 0;

  $append_sql = "
    UPDATE {feature}
    SET residues = residues || :chunk
    WHERE feature_id = :feature_id
  ";

  // Start from empty residues so that chunks can simply be appended.
  $sql = "UPDATE {feature} SET residues = '' WHERE feature_id = :feature_id";
  chado_query($sql, array(':feature_id' => $feature_id));

  print "Sequence complete: 0%. Memory: " . number_format(memory_get_usage()) . " bytes. \r";

  // Check the position before reading: for a record without residues the
  // next line already belongs to the following record. The explicit FALSE
  // comparison keeps a final line of "0" from ending the loop early.
  while (ftell($fh) < $seq_end and ($line = fgets($fh)) !== FALSE) {
    // fgets() keeps the line ending, so the length is the bytes consumed.
    $line_len = strlen($line);
    $num_read += $line_len;
    $chunk_intv_read += $line_len;
    $intv_read += $line_len;
    $chunk .= trim($line);

    if ($chunk_intv_read >= $chunk_size) {
      $success = chado_query($append_sql, array(':feature_id' => $feature_id, ':chunk' => $chunk));
      if (!$success) {
        return FALSE;
      }
      $chunk = '';
      $chunk_intv_read = 0;
    }

    if ($intv_read >= $interval) {
      $percent = sprintf("%.2f", ($num_read / $seqlen) * 100);
      print "Sequence complete: " . $percent . "%. Memory: " . number_format(memory_get_usage()) .
        " bytes. \r";
      $intv_read = 0;
    }
  }

  // Write whatever is left in the buffer.
  if (strlen($chunk) > 0) {
    $success = chado_query($append_sql, array(':feature_id' => $feature_id, ':chunk' => $chunk));
    if (!$success) {
      return FALSE;
    }
    $chunk = '';
  }

  // Let the database derive the length and checksum from what was stored.
  $sql = "UPDATE {feature} SET seqlen = char_length(residues), md5checksum = md5(residues) WHERE feature_id = :feature_id";
  chado_query($sql, array(':feature_id' => $feature_id));

  $percent = sprintf("%.2f", ($num_read / $seqlen) * 100);
  print "Sequence complete: " . $percent . "%. Memory: " . number_format(memory_get_usage()) .
    " bytes. \r";

  return TRUE;
}
|