12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266 |
- <?php
- class GFF3Importer extends TripalImporter {
-
/**
 * Name string for this importer.
 *
 * NOTE(review): presumably the human-readable label the TripalImporter
 * framework displays for this loader; the parent class is not visible
 * here, so confirm.
 */
- public static $name = 'Chado GFF3 File Loader';
-
/**
 * Machine-readable identifier string for this importer.
 *
 * NOTE(review): presumably must be unique among importers; confirm against
 * TripalImporter.
 */
- public static $machine_name = 'chado_gff3_loader';
-
/**
 * Short description string for this importer.
 */
- public static $description = 'Import a GFF3 file into Chado';
-
/**
 * List of file type strings ('gff' and 'gff3').
 *
 * NOTE(review): presumably the file extensions accepted for upload;
 * confirm against TripalImporter.
 */
- public static $file_types = ['gff', 'gff3'];
-
/**
 * Text asking the user to provide the GFF3 file.
 *
 * NOTE(review): presumably shown next to the file upload element; confirm.
 */
- public static $upload_description = 'Please provide the GFF3 file.';
-
/**
 * Title string for the GFF3 file.
 *
 * NOTE(review): presumably the title of the file upload element; confirm.
 */
- public static $upload_title = 'GFF3 File';
-
/**
 * Button label string.
 *
 * NOTE(review): presumably the text of the form's submit button; confirm.
 */
- public static $button_text = 'Import GFF3 file';
-
/**
 * Builds the form elements specific to the GFF3 importer.
 *
 * Adds a required organism selector, populated from the Chado organism
 * table, plus an "Additional Options" fieldset (collapsed by default) that
 * holds the start line, landmark type, alternate ID attribute, protein
 * handling, transaction, import mode, organism creation and alignment
 * target settings.
 *
 * @param array $form
 *   The form array to which the importer elements are added.
 * @param array $form_state
 *   The form state array. It is not read by this method.
 *
 * @return array
 *   The form array with the importer elements added.
 */
public function form($form, &$form_state) {

  // Build the organism options once; the list is shared by the main organism
  // selector and the target organism selector below.
  $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  $org_rset = chado_query($sql);
  $organisms = [];
  $organisms[''] = '';
  while ($organism = $org_rset->fetchObject()) {
    $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  }

  // Note: '#type' values are machine names and must never be passed through
  // t(); a translated value would not match any element type.
  $form['organism_id'] = [
    '#title' => t('Organism'),
    '#type' => 'select',
    '#description' => t("Choose the organism to which these sequences are associated"),
    '#required' => TRUE,
    '#options' => $organisms,
  ];

  $form['advanced'] = [
    '#type' => 'fieldset',
    '#title' => t('Additional Options'),
    '#collapsible' => TRUE,
    '#collapsed' => TRUE,
  ];
  $form['advanced']['line_number'] = [
    '#type' => 'textfield',
    '#title' => t('Start Line Number'),
    '#description' => t('Enter the line number in the GFF file where you would like to begin processing. The
      first line is line number 1.  This option is useful for examining loading problems with large GFF files.'),
    '#size' => 10,
  ];
  $form['advanced']['landmark_type'] = [
    '#title' => t('Landmark Type'),
    '#type' => 'textfield',
    '#description' => t("Optional. Use this field to specify a Sequence Ontology type
      for the landmark sequences in the GFF file (e.g. 'chromosome'). If the GFF file
      contains a '##sequence-region' line that describes the landmark sequences to
      which all others are aligned and a type is provided here then the features
      will be created if they do not already exist.  If they do exist then this
      field is not used."),
  ];
  $form['advanced']['alt_id_attr'] = [
    '#title' => t('ID Attribute'),
    '#type' => 'textfield',
    '#description' => t("Optional. Sometimes lines in the GFF file are missing the
      required ID attribute that specifies the unique name of the feature, but there
      may be another attribute that can uniquely identify the feature.  If so,
      you may specify the name of the attribute to use for the name."),
  ];
  $form['advanced']['skip_protein'] = [
    '#type' => 'checkbox',
    '#title' => t('Skip automatic protein creation'),
    '#required' => FALSE,
    '#description' => t('The GFF loader will automatically create a protein feature for each transcript in the GFF file if a protein feature is missing in the GFF file. Check this box to disable this functionality. Protein features that are specifically present in the GFF will always be created.'),
    '#default_value' => 0,
  ];

  // Optional regular expression / replacement pair used to name proteins
  // that the loader creates automatically.
  $form['advanced']['protein_names'] = [
    '#type' => 'fieldset',
    '#title' => t('Protein Names'),
    '#collapsible' => TRUE,
    '#collapsed' => FALSE,
    '#weight' => 5,
  ];
  $form['advanced']['protein_names']['re_help'] = [
    '#type' => 'item',
    '#markup' => t('A regular expression is an advanced method for extracting information from a string of text.
      If your GFF3 file does not contain polypeptide (or protein) features, but contains CDS features, proteins will be automatically created.
      By default the loader will give each protein a name based on the name of the corresponding mRNA followed by the "-protein" suffix.
      If you want to customize the name of the created protein, you can use the following regex.'),
  ];
  $form['advanced']['protein_names']['re_mrna'] = [
    '#type' => 'textfield',
    '#title' => t('Regular expression for the mRNA name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract portions of
      the mRNA unique name. For example, for a
      mRNA with a unique name finishing by -RX (e.g. SPECIES0000001-RA),
      the regular expression would be, "^(.*?)-R([A-Z]+)$".'),
  ];
  $form['advanced']['protein_names']['re_protein'] = [
    '#type' => 'textfield',
    '#title' => t('Replacement string for the protein name'),
    '#required' => FALSE,
    '#description' => t('Enter the replacement string that will be used to create
      the protein name based on the mRNA regular expression. For example, for a
      mRNA regular expression "^(.*?)-R([A-Z]+)$", the corresponding protein regular
      expression would be "$1-P$2".'),
  ];

  $form['advanced']['use_transaction'] = [
    '#type' => 'checkbox',
    '#title' => t('Use a transaction'),
    '#required' => FALSE,
    '#description' => t('Use a database transaction when loading the GFF file.  If an error occurs
      the entire dataset loaded prior to the failure will be rolled back and will not be available
      in the database.  If this option is unchecked and failure occurs all records up to the point
      of failure will be present in the database.'),
    '#default_value' => 1,
  ];

  // Import mode checkboxes. formValidate() rejects the submission when both
  // are checked at once.
  $form['advanced']['add_only'] = [
    '#type' => 'checkbox',
    '#title' => t('Import only new features'),
    '#required' => FALSE,
    '#description' => t('The job will skip features in the GFF file that already
      exist in the database and import only new features.'),
  ];
  $form['advanced']['update'] = [
    '#type' => 'checkbox',
    '#title' => t('Import all and update'),
    '#required' => FALSE,
    '#description' => t('Existing features will be updated and new features will be added.  Attributes
      for a feature that are not present in the GFF but which are present in the
      database will not be altered.'),
    '#default_value' => 1,
  ];

  $form['advanced']['create_organism'] = [
    '#type' => 'checkbox',
    '#title' => t('Create organism'),
    '#required' => FALSE,
    '#description' => t('The Tripal GFF loader supports the "organism" attribute. This allows features of a
      different organism to be aligned to the landmark sequence of another species.  The format of the
      attribute is "organism=[genus]:[species]", where [genus] is the organism\'s genus and [species] is the
      species name. Check this box to automatically add the organism to the database if it does not already exist.
      Otherwise lines with an organism attribute where the organism is not present in the database will be skipped.'),
  ];

  // Settings that describe the target sequences of alignment features.
  $form['advanced']['targets'] = [
    '#type' => 'fieldset',
    '#title' => t('Targets'),
    '#collapsible' => TRUE,
    '#collapsed' => FALSE,
    '#weight' => 1,
  ];
  $form['advanced']['targets']['adesc'] = [
    '#markup' => t("When alignments are represented in the GFF file (e.g. such as
      alignments of cDNA sequences to a whole genome, or blast matches), they are
      represented using two feature types: 'match' (or cDNA_match, EST_match, etc.)
      and 'match_part'.  These features may also have a 'Target' attribute to
      specify the sequence that is being aligned.
      However, the organism to which the aligned sequence belongs may not be present in the
      GFF file.  Here you can specify the organism and feature type of the target sequences.
      The options here will apply to all targets unless the organism and type are explicitly
      set in the GFF file using the 'target_organism' and 'target_type' attributes."),
  ];
  $form['advanced']['targets']['target_organism_id'] = [
    '#title' => t('Target Organism'),
    '#type' => 'select',
    '#description' => t("Optional. Choose the organism to which target sequences belong.
      Select this only if target sequences belong to a different organism than the
      one specified above. And only choose an organism here if all of the target sequences
      belong to the same species.  If the targets in the GFF file belong to multiple
      different species then the organism must be specified using the 'target_organism=genus:species'
      attribute in the GFF file."),
    '#options' => $organisms,
  ];
  $form['advanced']['targets']['target_type'] = [
    '#title' => t('Target Type'),
    '#type' => 'textfield',
    '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
      and an mRNA have the same name) then you must specify the type for all targets in the GFF file. If
      the targets are of different types then the type must be specified using the 'target_type=type' attribute
      in the GFF file. This must be a valid Sequence Ontology (SO) term."),
  ];
  $form['advanced']['targets']['create_target'] = [
    '#type' => 'checkbox',
    '#title' => t('Create Target'),
    '#required' => FALSE,
    '#description' => t("If the target feature cannot be found, create one using the organism and type specified above, or
      using the 'target_organism' and 'target_type' fields specified in the GFF file.  Values specified in the
      GFF file take precedence over those specified above."),
  ];

  return $form;
}
-
/**
 * Validates the values submitted through the GFF3 importer form.
 *
 * Checks performed:
 *  - "Import only new features" and "Import all and update" are not both
 *    checked.
 *  - The start line number, when given, is a non-negative whole number.
 *  - The mRNA regular expression and the protein replacement string are
 *    either both provided or both empty.
 *  - The mRNA regular expression compiles and the replacement can be applied.
 *
 * Problems are reported through form_set_error(); nothing is returned.
 *
 * @param array $form
 *   The form array. It is not read by this method.
 * @param array $form_state
 *   The form state array; the submitted values are read from
 *   $form_state['values'].
 */
public function formValidate($form, &$form_state) {
  $values = $form_state['values'];
  $add_only = $values['add_only'];
  $update = $values['update'];
  $line_number = trim($values['line_number']);
  $re_mrna = trim($values['re_mrna']);
  $re_protein = trim($values['re_protein']);

  // The two import modes offered by the form are mutually exclusive.
  if ($add_only and $update) {
    form_set_error('add_only', t("Please select only one checkbox from the import options section"));
  }

  // An empty value means "start at the beginning". Anything else must consist
  // of digits only, which rejects negatives, decimals ("1.5") and exponent
  // notation ("1e3") that is_numeric() would have let through. "0" remains
  // accepted, as it was before.
  if ($line_number !== '' and !preg_match('/^\d+$/', $line_number)) {
    form_set_error('line_number', t("Please provide an integer line number greater than zero."));
  }

  // The regular expression and its replacement only make sense as a pair.
  // Flag whichever field was left empty so that the error highlights a real
  // form element.
  if (!($re_mrna and $re_protein) and ($re_mrna or $re_protein)) {
    $missing_field = $re_mrna ? 're_protein' : 're_mrna';
    form_set_error($missing_field, t("You must provide both a regular expression for mRNA and a replacement string for protein"));
  }

  // Try the pattern against an empty subject to find out whether it compiles.
  // PCRE reports a bad pattern with an E_WARNING, which is silenced here
  // because the failure is reported as a form error instead. An empty string
  // is used as the subject because passing NULL is deprecated since PHP 8.1.
  set_error_handler(function () {
  }, E_WARNING);
  $pattern = "/" . $re_mrna . "/";
  $pattern_ok = (preg_match($pattern, '') !== FALSE);
  // preg_replace() signals an error by returning NULL (never FALSE).
  $replace_ok = $pattern_ok && (preg_replace($pattern, $re_protein, '') !== NULL);
  restore_error_handler();

  if (!$pattern_ok) {
    form_set_error('re_mrna', t('Invalid regular expression.'));
  }
  elseif (!$replace_ok) {
    form_set_error('re_protein', t('Invalid replacement string.'));
  }
}
-
/**
 * Runs the import using the arguments stored on this importer instance.
 *
 * Reads the path of the first file from $this->arguments['files'] and the
 * submitted options from $this->arguments['run_args'], then hands them all
 * to loadGFF3(). The refresh and remove modes are always passed as FALSE.
 */
public function run() {
  $opts = $this->arguments['run_args'];
  $gff_path = $this->arguments['files'][0]['file_path'];

  // These two modes are hard-wired off for every run.
  $refresh_mode = FALSE;
  $remove_mode = FALSE;

  $this->loadGFF3(
    $gff_path,
    $opts['organism_id'],
    $opts['analysis_id'],
    $opts['add_only'],
    $opts['update'],
    $refresh_mode,
    $remove_mode,
    $opts['use_transaction'],
    $opts['target_organism_id'],
    $opts['target_type'],
    $opts['create_target'],
    $opts['line_number'],
    $opts['landmark_type'],
    $opts['alt_id_attr'],
    $opts['create_organism'],
    $opts['re_mrna'],
    $opts['re_protein'],
    $opts['skip_protein']
  );
}
-
- private function loadGFF3($gff_file, $organism_id, $analysis_id,
- $add_only = 0, $update = 1, $refresh = 0, $remove = 0, $use_transaction = 1,
- $target_organism_id = NULL, $target_type = NULL, $create_target = 0,
- $start_line = 1, $landmark_type = '', $alt_id_attr = '', $create_organism = FALSE,
- $re_mrna = '', $re_protein = '', $skip_protein = 0) {
- $ret = [];
- $date = getdate();
-
-
- $cvterm_lookup = [];
-
-
- $landmark_lookup = [];
-
- $sql = "DELETE FROM {tripal_gff_temp}";
- chado_query($sql);
- $sql = "DELETE FROM {tripal_gffcds_temp}";
- chado_query($sql);
- $sql = "DELETE FROM {tripal_gffprotein_temp}";
- chado_query($sql);
-
- $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
- if (!file_exists($dfile)) {
-
-
- $dfile = $gff_file;
- }
- if (!file_exists($dfile)) {
- throw new Exception(t("Cannot find the file: !dfile", ['!dfile' => $dfile]));
- }
- $this->logMessage("Opening !gff_file", ['!gff_file' => $gff_file]);
-
- $fh = fopen($dfile, 'r');
- if (!$fh) {
- throw new Exception(t("Cannot open file: !dfile", ['!dfile' => $dfile]));
- }
- $filesize = filesize($dfile);
- $this->setTotalItems($filesize);
-
-
- $sql = "SELECT * FROM {cv} WHERE name = :cvname";
- $cv = chado_query($sql, [':cvname' => 'sequence'])->fetchObject();
- if (!$cv) {
- throw new Exception(t("Cannot find the 'sequence' ontology", []));
- }
-
- $sql = "SELECT * FROM {organism} WHERE organism_id = :organism_id";
- $organism = chado_query($sql, [':organism_id' => $organism_id])->fetchObject();
- $in_fasta = 0;
- $line_num = 0;
- $num_read = 0;
-
- $sel_cvterm_sql = "
- SELECT CVT.cvterm_id, CVT.cv_id, CVT.name, CVT.definition,
- CVT.dbxref_id, CVT.is_obsolete, CVT.is_relationshiptype
- FROM {cvterm} CVT
- INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
- LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
- WHERE CV.cv_id = :cv_id and
- (lower(CVT.name) = lower(:name) or lower(CVTS.synonym) = lower(:synonym))
- ";
-
- if ($landmark_type) {
- $query = [
- ':cv_id' => $cv->cv_id,
- ':name' => $landmark_type,
- ':synonym' => $landmark_type,
- ];
- $result = chado_query($sel_cvterm_sql, $query);
- $landmark_cvterm = $result->fetchObject();
- if (!$landmark_cvterm) {
- throw new Exception(t('Cannot find landmark feature type \'%landmark_type\'.', ['%landmark_type' => $landmark_type]));
- }
- }
-
- while ($line = fgets($fh)) {
- $line_num++;
- $size = drupal_strlen($line);
- $this->addItemsHandled($size);
- $num_read += $size;
- if ($line_num < $start_line) {
- continue;
- }
-
-
- if (preg_match('/^##FASTA/i', $line)) {
- $this->logMessage("Parsing FASTA portion...");
- if ($remove) {
-
- break;
- }
- $this->loadFasta($fh, $interval, $num_read, $line_num, $filesize);
- continue;
- }
-
- if (preg_match('/^##sequence-region (.*?) (\d+) (\d+)$/i', $line, $region_matches)) {
- $rid = $region_matches[1];
- $rstart = $region_matches[2];
- $rend = $region_matches[3];
- if ($landmark_type) {
- $this->loadFeature($organism, $analysis_id, $landmark_cvterm, $rid,
- $rid, '', 'f', 'f', 1, 0);
- }
- continue;
- }
-
- if (preg_match('/^#/', $line)) {
- continue;
- }
-
- if (preg_match('/^\s*$/', $line)) {
- continue;
- }
-
- $cols = explode("\t", $line);
- if (sizeof($cols) != 9) {
- throw new Exception(t('Improper number of columns on line %line_num', ['%line_num' => $line_num]));
- }
-
- $landmark = $cols[0];
- $source = $cols[1];
- $type = $cols[2];
- $start = $cols[3];
- $end = $cols[4];
- $score = $cols[5];
- $strand = $cols[6];
- $phase = $cols[7];
- $attrs = explode(";", $cols[8]);
-
-
- $fmin = $start - 1;
- $fmax = $end;
- if ($end < $start) {
- $fmin = $end - 1;
- $fmax = $start;
- }
-
- if (strcmp($strand, '.') == 0) {
- $strand = 0;
- }
- elseif (strcmp($strand, '+') == 0) {
- $strand = 1;
- }
- elseif (strcmp($strand, '-') == 0) {
- $strand = -1;
- }
- if (strcmp($phase, '.') == 0) {
- if ($type == 'CDS') {
- $phase = '0';
- }
- else {
- $phase = '';
- }
- }
- if (array_key_exists($type, $cvterm_lookup)) {
- $cvterm = $cvterm_lookup[$type];
- }
- else {
- $result = chado_query($sel_cvterm_sql, [
- ':cv_id' => $cv->cv_id,
- ':name' => $type,
- ':synonym' => $type,
- ]);
- $cvterm = $result->fetchObject();
- $cvterm_lookup[$type] = $cvterm;
- if (!$cvterm) {
- throw new Exception(t('Cannot find feature term \'%type\' on line %line_num of the GFF file',
- ['%type' => $type, '%line_num' => $line_num]));
- }
- }
-
- $tags = [];
- $attr_name = '';
- $attr_uniquename = '';
- $attr_residue_info = '';
- $attr_locgroup = 0;
- $attr_fmin_partial = 'f';
- $attr_fmax_partial = 'f';
- $attr_is_obsolete = 'f';
- $attr_is_analysis = 'f';
- $attr_others = [];
- $residues = '';
-
-
-
- $attr_organism = '';
- $feature_organism = $organism;
- foreach ($attrs as $attr) {
- $attr = rtrim($attr);
- $attr = ltrim($attr);
- if (strcmp($attr, '') == 0) {
- continue;
- }
- if (!preg_match('/^[^\=]+\=.+$/', $attr)) {
- throw new Exception(t('Attribute is not correctly formatted on line %line_num: %attr',
- ['%line_num' => $line_num, '%attr' => $attr]));
- }
-
- $tag = preg_split("/=/", $attr, 2);
-
- $tag_name = $tag[0];
- if (!array_key_exists($tag_name, $tags)) {
- $tags[$tag_name] = [];
- }
- $tags[$tag_name] = array_merge($tags[$tag_name], explode(",", $tag[1]));
-
- for ($i = 0; $i < count($tags[$tag_name]); $i++) {
- $tags[$tag_name][$i] = urldecode($tags[$tag_name][$i]);
- }
-
- $skip_feature = 0;
- if (strcmp($tag_name, 'ID') == 0) {
- $attr_uniquename = urldecode($tag[1]);
- }
- elseif (strcmp($tag_name, 'Name') == 0) {
- $attr_name = urldecode($tag[1]);
- }
- elseif (strcmp($tag_name, 'organism') == 0) {
- $attr_organism = urldecode($tag[1]);
- $org_matches = [];
- if (preg_match('/^(.*?):(.*?)$/', $attr_organism, $org_matches)) {
- $values = [
- 'genus' => $org_matches[1],
- 'species' => $org_matches[2],
- ];
- $org = chado_select_record('organism', ["*"], $values);
- if (count($org) == 0) {
- if ($create_organism) {
- $feature_organism = (object) chado_insert_record('organism', $values);
- if (!$feature_organism) {
- $this->logMessage("Could not add the organism, '%org', from line %line. Skipping this line.",
- [
- '%org' => $attr_organism,
- '%line' => $line_num,
- ], TRIPAL_ERROR);
- $skip_feature = 1;
- }
- }
- else {
- $this->logMessage("The organism attribute '%org' on line %line does not exist. Skipping this line.",
- [
- '%org' => $attr_organism,
- '%line' => $line_num,
- ], TRIPAL_ERROR);
- $skip_feature = 1;
- }
- }
- else {
-
- $feature_organism = $org[0];
- }
- }
- else {
- $this->logMessage("The organism attribute '%org' on line %line is not properly formated. It " .
- "should be of the form: organism=Genus:species. Skipping this line.",
- ['%org' => $attr_organism, '%line' => $line_num], TRIPAL_ERROR);
- $skip_feature = 1;
- }
- }
-
- elseif (strcmp($tag_name, 'Alias') != 0 and strcmp($tag_name, 'Parent') != 0 and
- strcmp($tag_name, 'Target') != 0 and strcmp($tag_name, 'Gap') != 0 and
- strcmp($tag_name, 'Derives_from') != 0 and strcmp($tag_name, 'Note') != 0 and
- strcmp($tag_name, 'Dbxref') != 0 and strcmp($tag_name, 'Ontology_term') != 0 and
- strcmp($tag_name, 'Is_circular') != 0 and strcmp($tag_name, 'target_organism') != 0 and
- strcmp($tag_name, 'target_type') != 0 and strcmp($tag_name, 'organism' != 0)) {
- foreach ($tags[$tag_name] as $value) {
- $attr_others[$tag_name][] = $value;
- }
- }
- }
-
- if (!$attr_uniquename and !$attr_name) {
-
-
- if (array_key_exists($alt_id_attr, $tags)) {
- $attr_uniquename = $tags[$alt_id_attr][0];
- $attr_name = $attr_uniquename;
- }
-
-
-
- elseif (array_key_exists('Parent', $tags)) {
- $attr_uniquename = $tags['Parent'][0] . "-$type-$landmark-" . $date[0] . ":" . ($fmin + 1) . ".." . $fmax;
- $attr_name = $attr_uniquename;
- }
-
-
- else {
- $attr_uniquename = $date[0] . "-$type-$landmark:" . ($fmin + 1) . ".." . $fmax;
- $attr_name = $type;
- }
- }
-
- if (strcmp($attr_name, '') == 0) {
- $attr_name = $attr_uniquename;
- }
-
-
-
- if (!$attr_uniquename) {
- $attr_uniquename = $attr_name . '-' . $date[0] . '-' . $line_num;
- }
-
-
-
-
-
- if (!$remove and !(strcmp($landmark, $attr_uniquename) == 0 or strcmp($landmark, $attr_name) == 0) and !in_array($landmark, $landmark_lookup)) {
- $select = [
- 'organism_id' => $organism->organism_id,
- 'uniquename' => $landmark,
- ];
- $columns = ['count(*) as num_landmarks'];
- if ($landmark_type) {
- $select['type_id'] = [
- 'name' => $landmark_type,
- ];
- }
- $count = chado_select_record('feature', $columns, $select);
- if (!$count or count($count) == 0 or $count[0]->num_landmarks == 0) {
-
- $select = [
- 'organism_id' => $organism->organism_id,
- 'name' => $landmark,
- ];
- $columns = ['count(*) as num_landmarks'];
- if ($landmark_type) {
- $select['type_id'] = [
- 'name' => $landmark_type,
- ];
- }
- $count = chado_select_record('feature', $columns, $select);
- if (!$count or count($count) == 0 or $count[0]->num_landmarks == 0) {
- throw new Exception(t("The landmark '%landmark' cannot be found for this organism (%species) " .
- "Please add the landmark and then retry the import of this GFF3 " .
- "file", [
- '%landmark' => $landmark,
- '%species' => $organism->genus . " " . $organism->species,
- ]));
- }
- elseif ($count[0]->num_landmarks > 1) {
- throw new Exception(t("The landmark '%landmark' has more than one entry for this organism (%species) " .
- "Cannot continue", [
- '%landmark' => $landmark,
- '%species' => $organism->genus . " " . $organism->species,
- ]));
- }
- }
- if ($count[0]->num_landmarks > 1) {
- throw new Exception(t("The landmark '%landmark' is not unique for this organism. " .
- "The features cannot be associated", ['%landmark' => $landmark]));
- }
-
- $landmark_lookup[] = $landmark;
- }
-
-
- if ($update or $refresh or $add_only) {
-
- $feature = $this->loadFeature($feature_organism, $analysis_id, $cvterm,
- $attr_uniquename, $attr_name, $residues, $attr_is_analysis,
- $attr_is_obsolete, $add_only, $score);
- if ($feature) {
-
-
- $values = [
- 'feature_id' => $feature->feature_id,
- 'organism_id' => $feature->organism_id,
- 'type_name' => $type,
- 'uniquename' => $feature->uniquename,
- ];
-
- $results = chado_select_record('tripal_gff_temp', ['*'], $values);
- if (count($results) == 0) {
- $result = chado_insert_record('tripal_gff_temp', $values);
- if (!$result) {
- throw new Exception(t("Cound not save record in temporary table, Cannot continue.", []));
- }
- }
-
-
-
- if (strcmp($landmark, $attr_uniquename) != 0) {
- $this->loadFeatureLoc($feature, $organism,
- $landmark, $fmin, $fmax, $strand, $phase, $attr_fmin_partial,
- $attr_fmax_partial, $attr_residue_info, $attr_locgroup);
- }
-
- if (array_key_exists('Alias', $tags)) {
- $this->loadAlias($feature, $tags['Alias']);
- }
-
- if (array_key_exists('Dbxref', $tags)) {
- $this->loadDbxref($feature, $tags['Dbxref']);
- }
-
- if (array_key_exists('Ontology_term', $tags)) {
- $this->loadOntology($feature, $tags['Ontology_term']);
- }
-
- if (array_key_exists('Parent', $tags)) {
- $this->loadParents($feature, $cvterm, $tags['Parent'],
- $feature_organism->organism_id, $strand, $phase, $fmin, $fmax);
- }
-
- if (array_key_exists('Target', $tags)) {
- $this->loadTarget($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup);
- }
-
- if (array_key_exists('Gap', $tags)) {
- foreach ($tags['Gap'] as $value) {
- $this->loadProperty($feature, 'Gap', $value);
- }
- }
-
- if (array_key_exists('Note', $tags)) {
- foreach ($tags['Note'] as $value) {
- $this->loadProperty($feature, 'Note', $value);
- }
- }
-
- if (array_key_exists('Derives_from', $tags)) {
- $this->loadDerivesFrom($feature, $cvterm, $tags['Derives_from'][0],
- $feature_organism, $fmin, $fmax);
- }
-
-
- $source_ref = ['GFF_source:' . $source];
- $this->loadDbxref($feature, $source_ref);
-
- if ($attr_others) {
- foreach ($attr_others as $tag_name => $values) {
- foreach ($values as $value) {
- $this->loadProperty($feature, $tag_name, $value);
- }
- }
- }
- }
- }
- }
-
- if (!$remove) {
-
- $sql = "SELECT feature_id FROM {tripal_gffcds_temp} LIMIT 1 OFFSET 1";
- $has_cds = chado_query($sql)->fetchField();
- if ($has_cds) {
- $this->logMessage("\nAdding protein sequences if CDS exist and no proteins in GFF...");
- $sql = "
- SELECT F.feature_id, F.name, F.uniquename, TGCT.strand,
- CVT.cvterm_id, CVT.name as feature_type,
- min(TGCT.fmin) as fmin, max(TGCT.fmax) as fmax,
- TGPT.feature_id as protein_id, TGPT.fmin as protein_fmin,
- TGPT.fmax as protein_fmax, FLM.uniquename as landmark
- FROM {tripal_gffcds_temp} TGCT
- INNER JOIN {feature} F on F.feature_id = TGCT.parent_id
- INNER JOIN {cvterm} CVT on CVT.cvterm_id = F.type_id
- INNER JOIN {featureloc} L on F.feature_id = L.feature_id
- INNER JOIN {feature} FLM on L.srcfeature_id = FLM.feature_id
- LEFT JOIN {tripal_gffprotein_temp} TGPT on TGPT.parent_id = F.feature_id
- GROUP BY F.feature_id, F.name, F.uniquename, CVT.cvterm_id, CVT.name,
- TGPT.feature_id, TGPT.fmin, TGPT.fmax, TGCT.strand, FLM.uniquename
- ";
- $results = chado_query($sql);
- $protein_cvterm = chado_get_cvterm([
- 'name' => 'polypeptide',
- 'cv_id' => [
- 'name' => 'sequence',
- ],
- ]);
- while ($result = $results->fetchObject()) {
-
-
- if (!$result->protein_id) {
-
- if ($re_mrna and $re_protein) {
-
- $uname = preg_replace("/$re_mrna/", $re_protein, $result->uniquename);
- $name = $result->name;
- }
- else {
-
- $uname = $result->uniquename . '-protein';
- $name = $result->name;
- }
- $values = [
- 'parent_id' => $result->feature_id,
- 'fmin' => $result->fmin,
- ];
- $min_phase = chado_select_record('tripal_gffcds_temp', ['phase'], $values);
- $values = [
- 'parent_id' => $result->feature_id,
- 'fmax' => $result->fmax,
- ];
- $max_phase = chado_select_record('tripal_gffcds_temp', ['phase'], $values);
- $pfmin = $result->fmin;
- $pfmax = $result->fmax;
- if ($result->strand == '-1') {
- $pfmax -= $max_phase[0]->phase;
- }
- else {
- $pfmin += $min_phase[0]->phase;
- }
- if ($skip_protein == 0) {
-
- $feature = $this->loadFeature($organism, $analysis_id,
- $protein_cvterm, $uname, $name, '', 'f', 'f', 1, 0);
-
- $cvterm = chado_get_cvterm(['cvterm_id' => $result->cvterm_id]);
- $this->loadDerivesFrom($feature, $cvterm,
- $result->uniquename, $organism, $pfmin, $pfmax);
-
-
- $this->loadFeatureLoc($feature, $organism, $result->landmark,
- $pfmin, $pfmax, $result->strand, '', 'f', 'f', '', 0);
- }
- }
- }
- }
- $this->logMessage("Setting ranks of children...");
-
- $sql = "
- SELECT DISTINCT F.feature_id, F.organism_id, F.type_id,
- F.uniquename, FL.strand
- FROM {tripal_gff_temp} TGT
- INNER JOIN {feature} F ON TGT.feature_id = F.feature_id
- INNER JOIN {feature_relationship} FR ON FR.object_id = TGT.feature_id
- INNER JOIN {cvterm} CVT ON CVT.cvterm_id = FR.type_id
- INNER JOIN {featureloc} FL ON FL.feature_id = F.feature_id
- WHERE CVT.name = 'part_of'
- ";
- $parents = chado_query($sql);
-
- $sel_gffchildren_sql = "
- SELECT DISTINCT FR.feature_relationship_id, FL.fmin, FR.rank
- FROM {feature_relationship} FR
- INNER JOIN {featureloc} FL on FL.feature_id = FR.subject_id
- INNER JOIN {cvterm} CVT on CVT.cvterm_id = FR.type_id
- WHERE FR.object_id = :feature_id AND CVT.name = 'part_of'
- ORDER BY FL.fmin ASC
- ";
-
-
-
-
- $num_recs = $parents->rowCount();
- $i = 1;
- while ($parent = $parents->fetchObject()) {
-
- $result = chado_query($sel_gffchildren_sql, [':feature_id' => $parent->feature_id]);
-
- $children = [];
- while ($child = $result->fetchObject()) {
- $children[] = $child;
- }
-
-
-
- if ($parent->strand == -1) {
- arsort($children);
- }
-
-
- $rank = -1;
- foreach ($children as $child) {
- $match = ['feature_relationship_id' => $child->feature_relationship_id];
- $values = ['rank' => $rank];
- chado_update_record('feature_relationship', $match, $values);
- $rank--;
- }
-
- $rank = 0;
- foreach ($children as $child) {
- $match = ['feature_relationship_id' => $child->feature_relationship_id];
- $values = ['rank' => $rank];
- chado_update_record('feature_relationship', $match, $values);
- $rank++;
- }
- $i++;
- }
- }
- return 1;
- }
-
/**
 * Adds a 'derives_from' relationship from a feature to its source feature.
 *
 * The type of the source feature is looked up first in the tripal_gff_temp
 * table and, failing that, in the feature table. When the derived feature is
 * a protein/polypeptide, the pairing is also recorded in the
 * tripal_gffprotein_temp table.
 *
 * @param object $feature
 *   The derived (subject) feature. Its feature_id and uniquename are read.
 * @param object $cvterm
 *   The type term of $feature. Only its name is read.
 * @param string $object
 *   The uniquename of the feature that $feature derives from.
 * @param object $organism
 *   The organism record. Only organism_id is read.
 * @param int $fmin
 *   The start coordinate stored with a protein record in the temp table.
 * @param int $fmax
 *   The end coordinate stored with a protein record in the temp table.
 *
 * @return string|null
 *   An empty string when no feature with the given uniquename exists, NULL in
 *   every other case. Problems are reported through logMessage().
 *
 * @throws Exception
 *   If the record cannot be saved in the tripal_gffprotein_temp table.
 */
private function loadDerivesFrom($feature, $cvterm, $object,
  $organism, $fmin, $fmax) {

  $type = $cvterm->name;

  // First try to learn the type of the source feature from the temp table.
  $values = [
    'organism_id' => $organism->organism_id,
    'uniquename' => $object,
  ];
  $type_id = NULL;
  $result = chado_select_record('tripal_gff_temp', ['type_name'], $values);
  if (count($result) > 0) {
    $otype = chado_get_cvterm([
      'name' => $result[0]->type_name,
      'cv_id' => [
        'name' => 'sequence',
      ],
    ]);
    if ($otype) {
      $type_id = $otype->cvterm_id;
    }
  }

  // Otherwise fall back to the feature table, which must yield exactly one
  // match for the type to be unambiguous.
  if (!$type_id) {
    $result = chado_select_record('feature', ['type_id'], $values);
    if (count($result) > 1) {
      $this->logMessage("Cannot find feature type for, '!subject' , in 'derives_from' relationship. Multiple matching features exist with this uniquename.",
        ['!subject' => $object], TRIPAL_WARNING);
      return;
    }
    if (count($result) == 0) {
      $this->logMessage("Cannot find feature type for, '!subject' , in 'derives_from' relationship.",
        ['!subject' => $object], TRIPAL_WARNING);
      return '';
    }
    // chado_select_record() returns an array of rows, so read the first row.
    $type_id = $result[0]->type_id;
  }

  // Get the source feature itself.
  $match = [
    'organism_id' => $organism->organism_id,
    'uniquename' => $object,
    'type_id' => $type_id,
  ];
  $ofeature = chado_select_record('feature', ['feature_id'], $match);
  if (count($ofeature) == 0) {
    $this->logMessage("Could not add 'Derives_from' relationship " .
      "for %uniquename and %subject. Subject feature, '%subject', " .
      "cannot be found.", [
        '%uniquename' => $feature->uniquename,
        '%subject' => $object,
      ], TRIPAL_ERROR);
    return;
  }

  // Remember which parent each protein derives from.
  if ($type == 'protein' or $type == 'polypeptide') {
    $values = [
      'feature_id' => $feature->feature_id,
      'parent_id' => $ofeature[0]->feature_id,
      'fmin' => $fmin,
      'fmax' => $fmax,
    ];
    $result = chado_insert_record('tripal_gffprotein_temp', $values);
    if (!$result) {
      throw new Exception(t("Cound not save record in temporary protein table, Cannot continue.", []));
    }
  }

  // Add the relationship unless it is already present.
  $values = [
    'object_id' => $ofeature[0]->feature_id,
    'subject_id' => $feature->feature_id,
    'type_id' => [
      'cv_id' => [
        'name' => 'sequence',
      ],
      'name' => 'derives_from',
    ],
    'rank' => 0,
  ];
  $rel = chado_select_record('feature_relationship', ['*'], $values);
  if (count($rel) > 0) {
    return;
  }

  $ret = chado_insert_record('feature_relationship', $values);
  if (!$ret) {
    $this->logMessage("Could not add 'Derives_from' relationship for :uniquename and :subject.",
      [
        ':uniquename' => $feature->uniquename,
        ':subject' => $object,
      ], TRIPAL_WARNING);
  }
}
-
/**
 * Adds 'part_of' relationships between a feature and its parents.
 *
 * Each parent is identified by uniquename and must be present in the
 * tripal_gff_temp table. For CDS features the parent/child pairing is also
 * stored in the tripal_gffcds_temp table.
 *
 * @param object $feature
 *   The child feature. Its feature_id and uniquename are read.
 * @param object $cvterm
 *   The type term of the child feature. Only its name is read.
 * @param array $parents
 *   The uniquenames of the parent features.
 * @param int $organism_id
 *   The organism used to look up the parents.
 * @param mixed $strand
 *   The strand stored with a CDS record.
 * @param mixed $phase
 *   The phase stored with a CDS record; omitted when not set.
 * @param int $fmin
 *   The start coordinate stored with a CDS record.
 * @param int $fmax
 *   The end coordinate stored with a CDS record.
 *
 * @return string|null
 *   An empty string when a parent is missing from the temp table (processing
 *   of the remaining parents stops there), NULL otherwise.
 *
 * @throws Exception
 *   If the 'part_of' term is missing from the sequence ontology or the CDS
 *   temp record cannot be saved.
 */
private function loadParents($feature, $cvterm, $parents,
  $organism_id, $strand, $phase, $fmin, $fmax) {

  $uname = $feature->uniquename;
  $type = $cvterm->name;
  $rel_type = 'part_of';

  // Finds a term in a vocabulary by its name or by one of its synonyms.
  $cvterm_sql = "
    SELECT CVT.cvterm_id
    FROM {cvterm} CVT
      INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
      LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
    WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
  ";

  // The relationship term is the same for every parent: resolve it once, but
  // only when a parent actually needs it.
  $relcvterm = NULL;

  foreach ($parents as $parent) {

    // The parent must have been seen in this GFF file.
    $values = [
      'organism_id' => $organism_id,
      'uniquename' => $parent,
    ];
    $result = chado_select_record('tripal_gff_temp', ['type_name'], $values);
    if (count($result) == 0) {
      $this->logMessage("Cannot find parent: %parent.", ['%parent' => $parent], TRIPAL_WARNING);
      return '';
    }
    $parent_type = $result[0]->type_name;

    $parentcvterm = chado_query($cvterm_sql, [
      ':cvname' => 'sequence',
      ':name' => $parent_type,
      ':synonym' => $parent_type,
    ])->fetchObject();

    if (!$relcvterm) {
      $relcvterm = chado_query($cvterm_sql, [
        ':cvname' => 'sequence',
        ':name' => $rel_type,
        ':synonym' => $rel_type,
      ])->fetchObject();
      if (!$relcvterm) {
        throw new Exception(t("Cannot find the term, 'part_of', from the sequence ontology. This term is used for associating parent and children features. Please check that the ontology is fully imported."));
      }
    }

    // Look up the parent feature. A parent whose type term cannot be
    // resolved is treated the same as a parent that does not exist.
    $parent_feature = NULL;
    if ($parentcvterm) {
      $values = [
        'organism_id' => $organism_id,
        'uniquename' => $parent,
        'type_id' => $parentcvterm->cvterm_id,
      ];
      $result = chado_select_record('feature', ['feature_id'], $values);
      if (count($result) > 0) {
        $parent_feature = $result[0];
      }
    }
    if (!$parent_feature) {
      $this->logMessage("Cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the parent.",
        [], TRIPAL_WARNING);
      continue;
    }

    // Add the relationship unless it already exists.
    $values = [
      'subject_id' => $feature->feature_id,
      'object_id' => $parent_feature->feature_id,
      'type_id' => $relcvterm->cvterm_id,
    ];
    $rel = chado_select_record('feature_relationship', ['*'], $values);
    if (count($rel) == 0) {
      $result = chado_insert_record('feature_relationship', $values);
      if (!$result) {
        $this->logMessage("Failed to insert feature relationship '$uname' ($type) $rel_type '$parent' ($parent_type).",
          [], TRIPAL_WARNING);
      }
    }

    // Keep track of CDS features and their parents in the temp table.
    if ($type == 'CDS') {
      $values = [
        'feature_id' => $feature->feature_id,
        'parent_id' => $parent_feature->feature_id,
        'fmin' => $fmin,
        'fmax' => $fmax,
        'strand' => $strand,
      ];
      if (isset($phase)) {
        $values['phase'] = $phase;
      }
      $result = chado_insert_record('tripal_gffcds_temp', $values);
      if (!$result) {
        throw new Exception(t("Cound not save record in temporary CDS table, Cannot continue.", []));
      }
    }
  }
}
-
/**
 * Associates database cross references with a feature.
 *
 * Each reference has the form "DBNAME:ACCESSION". The database is looked up
 * as "DB:DBNAME" and then as "DBNAME", and is created when neither exists.
 * The dbxref and the feature_dbxref link are created when missing.
 *
 * @param object $feature
 *   The feature to annotate. Only its feature_id is read.
 * @param array $dbxrefs
 *   The list of "DBNAME:ACCESSION" strings.
 *
 * @return int
 *   1 on success, 0 as soon as one reference cannot be stored.
 */
private function loadDbxref($feature, $dbxrefs) {

  foreach ($dbxrefs as $reference) {

    // Split on the first colon only so that accessions which themselves
    // contain a colon are kept whole. A reference without a colon yields an
    // empty accession.
    $parts = explode(':', $reference, 2);
    $dbname = trim($parts[0]);
    $accession = isset($parts[1]) ? trim($parts[1]) : '';

    // Find the database, trying the "DB:" prefixed name first, and create
    // it when it is unknown.
    $db = chado_select_record('db', ['db_id'], ['name' => "DB:$dbname"]);
    if (count($db) == 0) {
      $db = chado_select_record('db', ['db_id'], ['name' => "$dbname"]);
    }
    if (count($db) == 0) {
      $values = [
        'name' => $dbname,
        'description' => 'Added automatically by the GFF loader',
      ];
      $success = chado_insert_record('db', $values);
      if ($success) {
        $db = chado_select_record('db', ['db_id'], ['name' => "$dbname"]);
      }
      if (!$success or count($db) == 0) {
        $this->logMessage("Cannot find or add the database $dbname.", [], TRIPAL_WARNING);
        return 0;
      }
    }
    $db = $db[0];

    // Find or create the dbxref.
    $dbxref_match = [
      'accession' => $accession,
      'db_id' => $db->db_id,
    ];
    $dbxref = chado_select_record('dbxref', ['dbxref_id'], $dbxref_match);
    if (sizeof($dbxref) == 0) {
      $values = [
        'db_id' => $db->db_id,
        'accession' => $accession,
        'version' => '',
      ];
      $success = chado_insert_record('dbxref', $values);
      if ($success) {
        $dbxref = chado_select_record('dbxref', ['dbxref_id'], $dbxref_match);
      }
      // Without this check a failed insert would be dereferenced below.
      if (!$success or sizeof($dbxref) == 0) {
        $this->logMessage("Failed to insert Dbxref: $dbname:$accession.", [], TRIPAL_WARNING);
        return 0;
      }
    }
    $dbxref = $dbxref[0];

    // Link the dbxref to the feature unless the link already exists.
    $values = [
      'dbxref_id' => $dbxref->dbxref_id,
      'feature_id' => $feature->feature_id,
    ];
    $fdbx = chado_select_record('feature_dbxref', ['feature_dbxref_id'], $values);
    if (sizeof($fdbx) == 0) {
      $success = chado_insert_record('feature_dbxref', $values);
      if (!$success) {
        $this->logMessage("Failed to insert Dbxref: $dbname:$accession.", [], TRIPAL_WARNING);
        return 0;
      }
    }
  }
  return 1;
}
-
/**
 * Associates ontology terms with a feature.
 *
 * Each reference has the form "DBNAME:ACCESSION". Nothing is created here
 * except the feature_cvterm link: the database, the dbxref and the term must
 * already exist, otherwise a warning is logged and the reference is skipped.
 *
 * @param object $feature
 *   The feature to annotate. Only its feature_id is read.
 * @param array $dbxrefs
 *   The list of "DBNAME:ACCESSION" strings.
 */
private function loadOntology($feature, $dbxrefs) {

  foreach ($dbxrefs as $reference) {

    // Split on the first colon only so that accessions which themselves
    // contain a colon are kept whole. A reference without a colon yields an
    // empty accession, which then fails the dbxref lookup below.
    $parts = explode(':', $reference, 2);
    $dbname = trim($parts[0]);
    $accession = isset($parts[1]) ? trim($parts[1]) : '';

    // The database may be stored with or without the "DB:" prefix.
    $db = chado_select_record('db', ['db_id'], ['name' => "DB:$dbname"]);
    if (sizeof($db) == 0) {
      $db = chado_select_record('db', ['db_id'], ['name' => "$dbname"]);
    }
    if (sizeof($db) == 0) {
      $this->logMessage("Database, $dbname, is not present. Cannot associate term: $dbname:$accession.", [], TRIPAL_WARNING);
      continue;
    }
    $db = $db[0];

    $dbxref = chado_select_record('dbxref', ['dbxref_id'],
      ['accession' => $accession, 'db_id' => $db->db_id]);
    if (sizeof($dbxref) == 0) {
      $this->logMessage("Accession, $accession is missing for reference: $dbname:$accession.", [], TRIPAL_WARNING);
      continue;
    }
    $dbxref = $dbxref[0];

    // The term is found either through its primary dbxref or through an
    // additional dbxref in the cvterm_dbxref table.
    $cvterm = chado_select_record('cvterm', ['cvterm_id'], [
      'dbxref_id' => $dbxref->dbxref_id,
    ]);
    if (sizeof($cvterm) == 0) {
      $cvterm = chado_select_record('cvterm_dbxref', ['cvterm_id'], [
        'dbxref_id' => $dbxref->dbxref_id,
      ]);
    }
    if (sizeof($cvterm) == 0) {
      $this->logMessage("CV Term is missing for reference: $dbname:$accession.", [], TRIPAL_WARNING);
      continue;
    }
    $cvterm = $cvterm[0];

    // Link the term to the feature unless the link already exists.
    $link = [
      'cvterm_id' => $cvterm->cvterm_id,
      'feature_id' => $feature->feature_id,
    ];
    $fcvt = chado_select_record('feature_cvterm', ['feature_cvterm_id'], $link);
    if (sizeof($fcvt) == 0) {
      $link['pub_id'] = [
        'uniquename' => 'null',
      ];
      $success = chado_insert_record('feature_cvterm', $link);
      if (!$success) {
        $this->logMessage("Failed to insert ontology term: $dbname:$accession.", [], TRIPAL_WARNING);
      }
    }
  }
}
-
/**
 * Adds aliases (synonyms) to a feature.
 *
 * Ensures that the 'synonym_type' vocabulary, its 'exact' term and the 'null'
 * publication exist, then creates each synonym and its feature_synonym link
 * when missing.
 *
 * @param object $feature
 *   The feature to annotate. Only its feature_id is read.
 * @param array $aliases
 *   The list of alias names.
 *
 * @return int
 *   1 on success, 0 as soon as a required record cannot be created.
 */
private function loadAlias($feature, $aliases) {

  // Make sure the synonym type vocabulary exists.
  $cv_match = ['name' => 'synonym_type'];
  $results = chado_select_record('cv', ['*'], $cv_match);
  if (count($results) == 0) {
    $success = chado_insert_record('cv', [
      'name' => 'synonym_type',
      'definition' => 'vocabulary for synonym types',
    ]);
    if ($success) {
      $results = chado_select_record('cv', ['*'], $cv_match);
    }
    if (!$success or count($results) == 0) {
      $this->logMessage("Failed to add the synonyms type vocabulary.", [], TRIPAL_WARNING);
      return 0;
    }
  }
  $syncv = $results[0];

  // Make sure the 'exact' synonym type exists.
  $select = [
    'name' => 'exact',
    'cv_id' => [
      'name' => 'synonym_type',
    ],
  ];
  $result = chado_select_record('cvterm', ['*'], $select);
  if (count($result) > 0) {
    $syntype = $result[0];
  }
  else {
    $term = [
      'name' => 'exact',
      'id' => "synonym_type:exact",
      'definition' => '',
      'is_obsolete' => 0,
      'cv_name' => $syncv->name,
      'is_relationship' => FALSE,
    ];
    $syntype = chado_insert_cvterm($term, ['update_existing' => TRUE]);
    if (!$syntype) {
      $this->logMessage("Cannot add synonym type: synonym_type:exact.", [], TRIPAL_WARNING);
      return 0;
    }
  }

  // The 'null' publication is the same for every alias: resolve it once, but
  // only when there is at least one alias to store.
  $pub = NULL;

  foreach ($aliases as $alias) {

    // Find or create the synonym.
    $select = [
      'name' => $alias,
      'type_id' => $syntype->cvterm_id,
    ];
    $result = chado_select_record('synonym', ['*'], $select);
    if (count($result) == 0) {
      $values = [
        'name' => $alias,
        'type_id' => $syntype->cvterm_id,
        'synonym_sgml' => '',
      ];
      $success = chado_insert_record('synonym', $values);
      if ($success) {
        $result = chado_select_record('synonym', ['*'], $select);
      }
      if (!$success or count($result) == 0) {
        $this->logMessage("Cannot add alias $alias to synonym table.", [], TRIPAL_WARNING);
        return 0;
      }
    }
    $synonym = $result[0];

    // Find or create the 'null' publication.
    if (!$pub) {
      $pub_match = ['uniquename' => 'null'];
      $result = chado_select_record('pub', ['*'], $pub_match);
      if (count($result) == 0) {
        $pub_sql = "
          INSERT INTO {pub} (uniquename,type_id)
          VALUES (:uname,
            (SELECT cvterm_id
             FROM {cvterm} CVT
               INNER JOIN {dbxref} DBX ON DBX.dbxref_id = CVT.dbxref_id
               INNER JOIN {db} DB ON DB.db_id = DBX.db_id
             WHERE CVT.name = :type_id))
        ";
        // An INSERT has no rows to fetch, so success is judged by the
        // statement itself and confirmed by selecting the new record.
        $inserted = chado_query($pub_sql, [
          ':uname' => 'null',
          ':type_id' => 'null',
        ]);
        if ($inserted) {
          $result = chado_select_record('pub', ['*'], $pub_match);
        }
        if (!$inserted or count($result) == 0) {
          $this->logMessage("Cannot add null publication needed for setup of alias.", [], TRIPAL_WARNING);
          return 0;
        }
      }
      $pub = $result[0];
    }

    // Link the synonym to the feature unless the link already exists.
    $values = [
      'synonym_id' => $synonym->synonym_id,
      'feature_id' => $feature->feature_id,
      'pub_id' => $pub->pub_id,
    ];
    $result = chado_select_record('feature_synonym', ['feature_synonym_id'], $values);
    if (count($result) == 0) {
      $success = chado_insert_record('feature_synonym', $values);
      if (!$success) {
        $this->logMessage("Cannot add alias $alias to feature synonym table.", [], TRIPAL_WARNING);
        return 0;
      }
    }
  }
  return 1;
}
-
/**
 * Inserts a feature, or updates it when it already exists.
 *
 * A feature is identified by organism, uniquename and type. After the
 * feature is inserted or updated it is associated with the analysis through
 * the analysisfeature table, where the score is stored as the significance.
 *
 * @param object $organism
 *   The organism record. Only organism_id is read.
 * @param int $analysis_id
 *   The analysis to associate the feature with.
 * @param object $cvterm
 *   The feature type. Its cvterm_id and name are read.
 * @param string $uniquename
 *   The unique name of the feature.
 * @param string $name
 *   The human readable name of the feature.
 * @param string $residues
 *   The residues; only their md5 checksum is stored here.
 * @param mixed $is_analysis
 *   't', 'true', 1 or TRUE when the feature is the result of an analysis.
 * @param mixed $is_obsolete
 *   't', 'true', 1 or TRUE when the feature is obsolete.
 * @param mixed $add_only
 *   When truthy, an existing feature is returned untouched.
 * @param mixed $score
 *   The score, or '.' when there is none.
 *
 * @return object|int
 *   The feature record, or 0 when the feature could not be inserted or
 *   updated.
 */
private function loadFeature($organism, $analysis_id, $cvterm, $uniquename,
  $name, $residues, $is_analysis = 'f', $is_obsolete = 'f', $add_only = 0, $score = '.') {

  // Converts the accepted flag spellings into the literals sent to the
  // database. Strings are compared explicitly because a loose comparison of
  // 't' with 0 is TRUE before PHP 8, which used to turn 't' into 'FALSE'.
  $to_boolean_literal = function ($flag) {
    if (is_string($flag)) {
      $is_true = in_array(strtolower(trim($flag)), ['t', 'true', '1'], TRUE);
    }
    else {
      $is_true = (bool) $flag;
    }
    return $is_true ? 'TRUE' : 'FALSE';
  };
  $is_obsolete = $to_boolean_literal($is_obsolete);
  $is_analysis = $to_boolean_literal($is_analysis);

  // Check to see if the feature already exists.
  $identity = [
    'organism_id' => $organism->organism_id,
    'uniquename' => $uniquename,
    'type_id' => $cvterm->cvterm_id,
  ];
  $columns = [
    'feature_id',
    'name',
    'uniquename',
    'seqlen',
    'organism_id',
    'type_id',
  ];
  $feature = NULL;
  $result = chado_select_record('feature', $columns, $identity);
  if (count($result) > 0) {
    $feature = $result[0];
  }

  if (!$feature) {
    $values = [
      'organism_id' => $organism->organism_id,
      'name' => $name,
      'uniquename' => $uniquename,
      'md5checksum' => md5($residues),
      'type_id' => $cvterm->cvterm_id,
      'is_analysis' => $is_analysis,
      'is_obsolete' => $is_obsolete,
    ];
    // Test the result before casting: an object is always truthy, so a
    // failed insert could not be detected after the cast.
    $inserted = chado_insert_record('feature', $values);
    if (!$inserted) {
      $this->logMessage("Failed to insert feature '$uniquename' ($cvterm->name).", [], TRIPAL_WARNING);
      return 0;
    }
    $feature = (object) $inserted;
  }
  elseif ($add_only) {
    // The feature exists and the caller does not want it changed.
    return $feature;
  }
  else {
    $values = [
      'name' => $name,
      'md5checksum' => md5($residues),
      'is_analysis' => $is_analysis,
      'is_obsolete' => $is_obsolete,
    ];
    $result = chado_update_record('feature', $identity, $values);
    if (!$result) {
      $this->logMessage("Failed to update feature '$uniquename' ($cvterm->name).", [], TRIPAL_WARNING);
      return 0;
    }
  }

  // Associate the feature with the analysis. A score of '.' means that no
  // score is available.
  $has_score = strcmp($score, '.') != 0;
  $af_values = [
    'analysis_id' => $analysis_id,
    'feature_id' => $feature->feature_id,
  ];
  $afeature = chado_select_record('analysisfeature', ['analysisfeature_id'], $af_values);
  if (count($afeature) == 0) {
    if ($has_score) {
      $af_values['significance'] = $score;
    }
    if (!chado_insert_record('analysisfeature', $af_values)) {
      $this->logMessage("Could not add analysisfeature record: $analysis_id, $feature->feature_id.", [], TRIPAL_WARNING);
    }
  }
  elseif (!$add_only) {
    $new_vals = [
      'significance' => $has_score ? $score : '__NULL__',
    ];
    $ret = chado_update_record('analysisfeature', $af_values, $new_vals);
    if (!$ret) {
      $this->logMessage("Could not update analysisfeature record: $analysis_id, $feature->feature_id.", [], TRIPAL_WARNING);
    }
  }
  return $feature;
}
-
/**
 * Inserts or updates the location of a feature on a landmark feature.
 *
 * The landmark is resolved by uniquename, then by name and, for alignment
 * targets, by uniquename regardless of organism and type. It can optionally
 * be created when it cannot be found. An existing location on the same
 * landmark that shares fmin or fmax is updated; otherwise a new location is
 * added with the next rank.
 *
 * @param object $feature
 *   The feature being located. Only its feature_id is read.
 * @param object $organism
 *   The organism used to find the landmark when $landmark_organism_id is
 *   empty. Only organism_id is read.
 * @param string $landmark
 *   The uniquename or name of the landmark feature.
 * @param int $fmin
 *   The start coordinate.
 * @param int $fmax
 *   The end coordinate.
 * @param mixed $strand
 *   The strand.
 * @param mixed $phase
 *   The phase; stored only when it is numeric.
 * @param mixed $is_fmin_partial
 *   'f', 'false' or a falsy value for FALSE, anything else for TRUE.
 * @param mixed $is_fmax_partial
 *   'f', 'false' or a falsy value for FALSE, anything else for TRUE.
 * @param mixed $residue_info
 *   The value stored in featureloc.residue_info.
 * @param mixed $locgroup
 *   The value stored in featureloc.locgroup.
 * @param mixed $landmark_type_id
 *   Optional type used to restrict the landmark lookup and to create it.
 * @param mixed $landmark_organism_id
 *   Optional organism of the landmark.
 * @param mixed $create_landmark
 *   When truthy, a missing landmark is created (requires a type).
 * @param mixed $landmark_is_target
 *   When truthy, the landmark may be matched by uniquename alone.
 *
 * @return int
 *   1 on success, 0 when the landmark cannot be resolved.
 *
 * @throws Exception
 *   If the featureloc record cannot be inserted.
 */
private function loadFeatureLoc($feature, $organism, $landmark, $fmin,
  $fmax, $strand, $phase, $is_fmin_partial, $is_fmax_partial, $residue_info, $locgroup,
  $landmark_type_id = '', $landmark_organism_id = '', $create_landmark = 0,
  $landmark_is_target = 0) {

  $lookup_organism_id = $landmark_organism_id ? $landmark_organism_id : $organism->organism_id;
  $ambiguous_message = "multiple landmarks exist with the name: '%landmark'. Cannot " .
    "resolve which one to use. Cannot add the feature location record.";

  // First try to find the landmark by its uniquename.
  $select = [
    'organism_id' => $lookup_organism_id,
    'uniquename' => $landmark,
  ];
  if ($landmark_type_id) {
    $select['type_id'] = $landmark_type_id;
  }
  $results = chado_select_record('feature', ['feature_id'], $select);
  if (count($results) > 1) {
    $this->logMessage($ambiguous_message, ['%landmark' => $landmark], TRIPAL_WARNING);
    return 0;
  }

  $srcfeature = '';
  if (count($results) == 1) {
    $srcfeature = $results[0];
  }
  else {
    // Then try to find the landmark by its name.
    $select = [
      'organism_id' => $lookup_organism_id,
      'name' => $landmark,
    ];
    if ($landmark_type_id) {
      $select['type_id'] = $landmark_type_id;
    }
    $results = chado_select_record('feature', ['feature_id'], $select);
    if (count($results) > 1) {
      $this->logMessage($ambiguous_message, ['%landmark' => $landmark], TRIPAL_WARNING);
      return 0;
    }

    if (count($results) == 1) {
      $srcfeature = $results[0];
    }
    else {
      // An alignment target may be matched on its uniquename alone, as long
      // as that yields exactly one feature.
      if ($landmark_is_target) {
        $results = chado_select_record('feature', ['feature_id'], ['uniquename' => $landmark]);
        if (count($results) == 1) {
          $srcfeature = $results[0];
        }
      }
      if (!$srcfeature) {
        // As a last resort create the landmark, which requires a type.
        if (!($create_landmark and $landmark_type_id)) {
          $this->logMessage("Cannot find unique landmark feature: '%landmark'.",
            ['%landmark' => $landmark], TRIPAL_WARNING);
          return 0;
        }
        $values = [
          'organism_id' => $lookup_organism_id,
          'name' => $landmark,
          'uniquename' => $landmark,
          'type_id' => $landmark_type_id,
        ];
        $results = chado_insert_record('feature', $values);
        if (!$results) {
          $this->logMessage("Cannot find landmark feature: '%landmark', nor could it be inserted.",
            ['%landmark' => $landmark], TRIPAL_WARNING);
          return 0;
        }
        $srcfeature = new stdClass();
        $srcfeature->feature_id = $results['feature_id'];
      }
    }
  }

  // Look through the existing locations of the feature. A location on the
  // same landmark that shares its start or end is updated in place; the rank
  // for a new location is one more than the last rank seen.
  $rank = 0;
  $exists = 0;
  $options = [
    'order_by' => [
      'rank' => 'ASC',
    ],
  ];
  $locrecs = chado_select_record('featureloc', ['*'],
    ['feature_id' => $feature->feature_id], $options);
  foreach ($locrecs as $featureloc) {

    // Locations without a source feature are ignored.
    if (!$featureloc->srcfeature_id) {
      continue;
    }

    // Compare by feature id rather than by name: the landmark may have been
    // resolved through its uniquename, which can differ from its name.
    $same_landmark = $featureloc->srcfeature_id == $srcfeature->feature_id;
    if ($same_landmark and
      ($featureloc->fmin == $fmin or $featureloc->fmax == $fmax)) {
      $exists = 1;
      $values = [];
      if ($featureloc->fmin != $fmin) {
        $values['fmin'] = $fmin;
      }
      if ($featureloc->fmax != $fmax) {
        $values['fmax'] = $fmax;
      }
      if ($featureloc->strand != $strand) {
        $values['strand'] = $strand;
      }
      if (count($values) > 0) {
        chado_update_record('featureloc',
          ['featureloc_id' => $featureloc->featureloc_id], $values);
      }
    }
    $rank = $featureloc->rank + 1;
  }

  if (!$exists) {
    // Converts the accepted flag spellings into the literals sent to the
    // database.
    $to_boolean_literal = function ($flag) {
      if (is_string($flag)) {
        $is_false = in_array(strtolower(trim($flag)), ['', '0', 'f', 'false'], TRUE);
      }
      else {
        $is_false = !$flag;
      }
      return $is_false ? 'FALSE' : 'TRUE';
    };

    $values = [
      'feature_id' => $feature->feature_id,
      'srcfeature_id' => $srcfeature->feature_id,
      'fmin' => $fmin,
      'is_fmin_partial' => $to_boolean_literal($is_fmin_partial),
      'fmax' => $fmax,
      'is_fmax_partial' => $to_boolean_literal($is_fmax_partial),
      'strand' => $strand,
      'residue_info' => $residue_info,
      'locgroup' => $locgroup,
      'rank' => $rank,
    ];
    // Phase 0 is a valid value, so test for a number instead of truthiness.
    if (is_numeric($phase)) {
      $values['phase'] = $phase;
    }
    $success = chado_insert_record('featureloc', $values);
    if (!$success) {
      throw new Exception("Failed to insert featureloc.");
    }
  }
  return 1;
}
-
/**
 * Adds a property to a feature.
 *
 * The property type is a term in the 'feature_property' vocabulary and is
 * created as a local term when missing. A value that is already present for
 * the feature and type is not added again; a new value gets the next rank.
 *
 * @param object $feature
 *   The feature to annotate. Only its feature_id is read.
 * @param string $property
 *   The name of the property type.
 * @param string $value
 *   The property value.
 *
 * @return int|null
 *   0 when the property term cannot be created, NULL otherwise.
 */
private function loadProperty($feature, $property, $value) {

  // Find the property term, or create it as a local term.
  $select = [
    'name' => $property,
    'cv_id' => [
      'name' => 'feature_property',
    ],
  ];
  $result = chado_select_record('cvterm', ['*'], $select);
  if (count($result) > 0) {
    $cvterm = $result[0];
  }
  else {
    $term = [
      'id' => "local:$property",
      'name' => $property,
      'is_obsolete' => 0,
      'cv_name' => 'feature_property',
      'db_name' => 'local',
      'is_relationship' => FALSE,
    ];
    // Test the result before casting: an object is always truthy, so a
    // failed insert could not be detected after the cast.
    $inserted = chado_insert_cvterm($term, ['update_existing' => FALSE]);
    if (!$inserted) {
      $this->logMessage("Cannot add cvterm, $property.", [], TRIPAL_WARNING);
      return 0;
    }
    $cvterm = (object) $inserted;
  }

  // Walk through the existing values of this property to detect a duplicate
  // and to find the next free rank.
  $already_present = FALSE;
  $rank = 0;
  $select = [
    'feature_id' => $feature->feature_id,
    'type_id' => $cvterm->cvterm_id,
  ];
  $options = [
    'order_by' => [
      'rank' => 'ASC',
    ],
  ];
  $results = chado_select_record('featureprop', ['*'], $select, $options);
  foreach ($results as $prop) {
    if (strcmp($prop->value, $value) == 0) {
      $already_present = TRUE;
    }
    $rank = $prop->rank + 1;
  }

  if (!$already_present) {
    $values = [
      'feature_id' => $feature->feature_id,
      'type_id' => $cvterm->cvterm_id,
      'value' => $value,
      'rank' => $rank,
    ];
    $result = chado_insert_record('featureprop', $values);
    if (!$result) {
      $this->logMessage("cannot add featureprop, $property.", [], TRIPAL_WARNING);
    }
  }
}
-
/**
 * Loads the FASTA sequences found at the end of a GFF file.
 *
 * Reads the handle until the end of the file. Every sequence is matched by
 * its identifier (the first word after '>') against the uniquename column of
 * the tripal_gff_temp table and stored in the residues and seqlen columns of
 * the matching feature.
 *
 * @param resource $fh
 *   The open file handle, positioned at the first FASTA line.
 * @param mixed $interval
 *   Not used by this method.
 * @param int $num_read
 *   Running count of the characters read; updated by reference.
 * @param int $line_num
 *   Running line number; updated by reference.
 * @param mixed $filesize
 *   Not used by this method.
 */
private function loadFasta($fh, $interval, &$num_read, &$line_num, $filesize) {
  $this->logMessage("Loading FASTA sequences...");

  // Stores one finished sequence on the feature with the given uniquename.
  $store_sequence = function ($uname, $sequence) {
    $result = chado_select_record('tripal_gff_temp', ['*'], ['uniquename' => $uname]);
    if (count($result) == 0) {
      $this->logMessage('Cannot find feature to assign FASTA sequence: %uname.',
        ['%uname' => $uname], TRIPAL_WARNING);
      return;
    }
    $values = [
      'residues' => $sequence,
      'seqlen' => strlen($sequence),
    ];
    chado_update_record('feature', ['feature_id' => $result[0]->feature_id], $values);
  };

  $residues = '';
  $id = NULL;

  // Compare against FALSE explicitly so that a final line consisting of "0"
  // does not end the loop early.
  while (($line = fgets($fh)) !== FALSE) {
    $line_num++;
    $size = drupal_strlen($line);
    $this->addItemsHandled($size);
    $num_read += $size;
    $line = trim($line);

    if (preg_match('/^>/', $line)) {
      // A new header closes the previous sequence, if there was one.
      if ($id !== NULL) {
        $store_sequence($id, $residues);
      }
      // The identifier is the first word after the '>'.
      $id = preg_replace('/^>([^\s]+).*$/', '\1', $line);
      $residues = '';
    }
    else {
      $residues .= $line;
    }
  }

  // Store the last sequence. Nothing is stored when no header was seen.
  if ($id !== NULL) {
    $store_sequence($id, $residues);
  }
}
-
/**
 * Adds the alignment described by a 'Target' attribute as a feature location.
 *
 * The attribute has the form "target_id start end [strand]". The organism
 * and type of the target can be overridden per feature with the
 * 'target_organism' (genus:species) and 'target_type' attributes.
 *
 * @param object $feature
 *   The feature that is aligned to the target.
 * @param array $tags
 *   The GFF attributes of the feature, keyed by tag name.
 * @param mixed $target_organism_id
 *   The default organism of the target.
 * @param string $target_type
 *   The default type of the target: a term name in the sequence ontology.
 * @param mixed $create_target
 *   Passed to loadFeatureLoc() to allow creating a missing target.
 * @param mixed $attr_locgroup
 *   The locgroup passed to loadFeatureLoc().
 *
 * @throws Exception
 *   If $target_type is not a term of the sequence ontology.
 */
private function loadTarget($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup) {

  $matched = preg_match('/^(.*?)\s+(\d+)\s+(\d+)(\s+[\+|\-])*$/', trim($tags['Target'][0]), $matches);
  if (!$matched) {
    $this->logMessage("Could not add 'Target' alignment as it is improperly formatted: '%target'",
      ['%target' => $tags['Target'][0]], TRIPAL_ERROR);
    return;
  }

  // Per-feature overrides of the target organism and type.
  $gff_target_organism = array_key_exists('target_organism', $tags) ? $tags['target_organism'][0] : '';
  $gff_target_type = array_key_exists('target_type', $tags) ? $tags['target_type'][0] : '';

  $target_feature = $matches[1];
  $start = $matches[2];
  $end = $matches[3];

  // The strand is optional: the capture group is absent when it is omitted.
  $strand_token = isset($matches[4]) ? trim($matches[4]) : '';
  if ($strand_token === '+') {
    $target_strand = 1;
  }
  elseif ($strand_token === '-') {
    $target_strand = -1;
  }
  else {
    $target_strand = 0;
  }

  // Convert the 1-based GFF coordinates to an interbase range, swapping the
  // coordinates when they are given in reverse order.
  $target_fmin = $start - 1;
  $target_fmax = $end;
  if ($end < $start) {
    $target_fmin = $end - 1;
    $target_fmax = $start;
  }

  // Resolve the organism of the target.
  $t_organism_id = $target_organism_id;
  if ($gff_target_organism) {
    if (preg_match('/^(.*?):(.*?)$/', $gff_target_organism, $organism_parts)) {
      $values = [
        'genus' => $organism_parts[1],
        'species' => $organism_parts[2],
      ];
      $torganism = chado_select_record('organism', ['organism_id'], $values);
      if (count($torganism) == 1) {
        $t_organism_id = $torganism[0]->organism_id;
      }
      else {
        $this->logMessage("Cannot find organism for target %target.",
          ['%target' => $gff_target_organism], TRIPAL_WARNING);
        $t_organism_id = '';
      }
    }
    else {
      $this->logMessage("The target_organism attribute is improperly formatted: %target. " .
        "It should be target_organism=genus:species.",
        ['%target' => $gff_target_organism], TRIPAL_WARNING);
      $t_organism_id = '';
    }
  }

  // Resolve the type of the target, starting with the default type.
  $t_type_id = '';
  if ($target_type) {
    $values = [
      'name' => $target_type,
      'cv_id' => [
        'name' => 'sequence',
      ],
    ];
    $type = chado_select_record('cvterm', ['cvterm_id'], $values);
    if (count($type) != 1) {
      throw new Exception(t("The target type does not exist in the sequence ontology: %type. ",
        ['%type' => $target_type]));
    }
    $t_type_id = $type[0]->cvterm_id;
  }

  // The type given in the GFF file overrides the default. It may be a term
  // name or a synonym of a term.
  if ($gff_target_type) {
    $values = [
      'name' => $gff_target_type,
      'cv_id' => [
        'name' => 'sequence',
      ],
    ];
    $type = chado_select_record('cvterm', ['cvterm_id'], $values);
    if (count($type) == 1) {
      $t_type_id = $type[0]->cvterm_id;
    }
    else {
      $sql = "
        SELECT CVTS.cvterm_id
        FROM {cvtermsynonym} CVTS
          INNER JOIN {cvterm} CVT ON CVT.cvterm_id = CVTS.cvterm_id
          INNER JOIN {cv} CV ON CV.cv_id = CVT.cv_id
        WHERE CV.name = 'sequence' and CVTS.synonym = :synonym
      ";
      $synonym = chado_query($sql, [':synonym' => $gff_target_type])->fetchObject();
      if ($synonym) {
        $t_type_id = $synonym->cvterm_id;
      }
      else {
        $this->logMessage("The target_type attribute does not exist in the sequence ontology: %type.",
          ['%type' => $gff_target_type], TRIPAL_WARNING);
        $t_type_id = '';
      }
    }
  }

  // This method receives no organism, phase, partial flags or residue info,
  // so explicit empty values are passed instead of undefined variables. The
  // placeholder organism keeps the landmark lookup without a default
  // organism when $t_organism_id is empty.
  // NOTE(review): the organism of $feature may be the intended fallback
  // here -- confirm against the caller before changing it.
  $no_organism = new stdClass();
  $no_organism->organism_id = NULL;

  $this->loadFeatureLoc($feature, $no_organism, $target_feature, $target_fmin,
    $target_fmax, $target_strand, NULL, 'f', 'f', NULL,
    $attr_locgroup, $t_type_id, $t_organism_id, $create_target, TRUE);
}
- }
|