12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907 |
- <?php
- class GFF3Importer extends TripalImporter {
- /**
- * The name of this loader. This name will be presented to the site
- * user.
- */
- public static $name = 'Chado GFF3 File Loader';
- /**
- * The machine name for this loader. This name will be used to construct
- * the URL for the loader.
- */
- public static $machine_name = 'chado_gff3_loader';
- /**
- * A brief description for this loader. This description will be
- * presented to the site user.
- */
- public static $description = 'Import a GFF3 file into Chado';
- /**
- * An array containing the extensions of allowed file types.
- */
- public static $file_types = ['gff', 'gff3'];
- /**
- * Provides information to the user about the file upload. Typically this
- * may include a description of the file types allowed.
- */
- public static $upload_description = 'Please provide the GFF3 file.';
- /**
- * The title that should appear above the upload button.
- */
- public static $upload_title = 'GFF3 File';
- /**
- * Text that should appear on the button at the bottom of the importer
- * form.
- */
- public static $button_text = 'Import GFF3 file';
- /**
- * A handle to a temporary file for caching the GFF features. This allows for
- * quick lookup of parsed features without having to store it in RAM.
- */
- private $gff_cache_file = NULL;
- /**
- * The name of the temporary cache file.
- */
- private $gff_cache_file_name = NULL;
- /**
- * The lines from the ##sequence-region at the top of the GFF
- */
- private $seq_region_headers = [];
- /**
- * The path to the GFF3 file.
- */
- private $gff_file = NULL;
- /**
- * The file handle for the GFF3 file.
- */
- private $gff_file_h = NULL;
- /**
- * The organism ID for this GFF file.
- */
- private $organism_id = NULL;
- /**
- * The organism ChadoRecord object that corresponds to the $organism_id value.
- */
- private $organism = NULL;
- /**
- * An array of organism records for quick lookup.
- */
- private $organism_lookup = [];
- /**
- * The analysis ID for this GFF file
- */
- private $analysis_id = NULL;
- /**
- * The analysis ChadoRecord object that corresponds to the $analysis_id value.
- */
- private $analysis = NULL;
- /**
- * A flag indicating if only new items should be added (no updates)
- */
- private $add_only = NULL;
- /**
- * A flag indicating if only existing items should be updated.
- */
- private $update = TRUE;
- /**
- * If the GFF file contains a 'Target' attribute then the feature and the
- * target will have an alignment created, but to find the proper target
- * feature the target organism must also be known. If different from the
- * organism specified for the GFF file, then use this argument to specify
- * the target organism. Only use this argument if all target sequences
- * belong to the same species. If the targets in the GFF file belong to
- * multiple different species then the organism must be specified using the
- * 'target_organism=genus:species' attribute in the GFF file. Default is
- * NULL.
- */
- private $target_organism_id = NULL;
- /**
- * If the GFF file contains a 'Target' attribute then the feature and the
- * target will have an alignment created, but to find the proper target
- * feature the target organism must also be known. This can be used to
- * specify the target feature type to help with identification of the
- * target feature. Only use this argument if all target sequences types are
- * the same. If the targets are of different types then the type must be
- * specified using the 'target_type=type' attribute in the GFF file. This
- * must be a valid Sequence Ontology (SO) term. Default is NULL
- */
- private $target_type = NULL;
- private $target_type_id = NULL;
- /**
- * A flag indicating if the target feature should be created. If FALSE
- * then it should already exist.
- */
- private $create_target = FALSE;
- /**
- * Set this to the line in the GFF file where importing should start. This
- * is useful for testing and debugging GFF files that may have problems and
- * you want to start at a particular line to speed testing. Default = 1
- */
- private $start_line = 1;
- /**
- * During parsing of the GFF file this keeps track of the current line
- * number.
- */
- private $current_line = 0;
- /**
- * A Sequence Ontology term name for the landmark sequences in the GFF
- * file (e.g. 'chromosome'), if the GFF file contains a '##sequence-region'
- * line that describes the landmark sequences. Default = ''
- */
- private $landmark_type = '';
- /**
- * The ChadoRecord object for the landmark type cvterm.
- */
- private $landmark_cvterm = NULL;
- /**
- * Regular expression to pull out the mRNA name.
- */
- private $re_mrna = '';
- /**
- * Regular expression to pull out the protein name.
- */
- private $re_protein = '';
- /**
- * A flag that indicates if a protein record should be created.
- * @var integer
- */
- private $skip_protein = 0;
- /**
- * Sometimes lines in the GFF file are missing the required ID attribute
- * that specifies the unique name of the feature. If so, you may specify
- * the name of an existing attribute to use for the ID.
- */
- private $alt_id_attr = '';
- /**
- * The Tripal GFF loader supports the "organism" attribute. This allows
- * features of a different organism to be aligned to the landmark sequence
- * of another species. The format of the attribute is
- * "organism=[genus]:[species]", where [genus] is the organism's genus and
- * [species] is the species name. Check this box to automatically add the
- * organism to the database if it does not already exist. Otherwise lines
- * with an organism attribute where the organism is not present in the
- * database will be skipped.
- */
- private $create_organism = FALSE;
- /**
- * Holds mapping of DB names to DB ids.
- */
- private $db_lookup = [];
- /**
- * Holds a mapping of Dbxref names to ids.
- */
- private $dbxref_lookup = [];
- /**
- * Holds a mapping of Dbxref names to cvterm ids.
- */
- private $cvterm_lookup = [];
- /**
- * Holds a mapping of synonyms to ids.
- */
- private $synonym_lookup = [];
- /**
- * Maps parents to their children and contains the ranks of the children.
- */
- private $parent_lookup = [];
- /**
- * An array that stores CVterms that have been looked up so we don't have
- * to do the database query every time.
- */
- private $feature_cvterm_lookup = [];
- /**
- * An array that stores feature property CVterms that have been looked up so
- * we don't have to do the database query every time.
- */
- private $featureprop_cvterm_lookup = [];
- /**
- * Holds the CV term for the "exact" synonym.
- */
- private $exact_syn = NULL;
- /**
- * Holds the object for the null publication record.
- */
- private $null_pub = NULL;
- /**
- * The list of features from the GFF3 file. Each element is an
- * associative array of the columns from the GFF3 file, with the attribute
- * field being an associative array of key/value pairs.
- */
- private $features = [];
- /**
- * An associative array containing the pointers to the FASTA sequences
- * in the GFF file. We don't want to load these into memory as they
- * may be too big!
- */
- private $residue_index = [];
- /**
- * An array that stores landmarks objects. Landmarks should be inserted
- * first if they don't already exist.
- */
- private $landmarks = [];
- /**
- * A controlled vocabulary ChadoRecord object. This is the CV that will be
- * used for feature properties.
- */
- private $feature_prop_cv = NULL;
- /**
- * A controlled vocabulary ChadoRecord object. This is the CV that will be
- * used for feature types (the 'sequence' vocabulary).
- */
- private $feature_cv = NULL;
- /**
- * @see TripalImporter::form()
- */
- public function form($form, &$form_state) {
- // get the list of organisms
- $sql = "SELECT * FROM {organism} ORDER BY genus, species";
- $org_rset = chado_query($sql);
- $organisms = [];
- $organisms[''] = '';
- while ($organism = $org_rset->fetchObject()) {
- $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
- }
- $form['organism_id'] = [
- '#title' => t('Existing Organism'),
- '#type' => 'select',
- '#description' => t("Choose an existing organism to which the entries in the GFF file will be associated."),
- '#required' => TRUE,
- '#options' => $organisms,
- ];
- $form['create_organism'] = [
- '#type' => 'checkbox',
- '#title' => t('Create organism'),
- '#required' => FALSE,
- '#description' => t('The Tripal GFF loader supports the "organism" attribute. This allows features of a
- different organism to be aligned to the landmark sequence. The format of the
- attribute is "organism=[genus]:[species]", where [genus] is the organism\'s genus and [species] is the
- species name. Check this box to automatically add the organism to the database if it does not already exists.
- Otherwise lines with an organism attribute where the organism is not present in the database will be skipped.'),
- ];
- $form['landmark_type'] = [
- '#title' => t('Landmark Type'),
- '#type' => 'textfield',
- '#description' => t("Optional. Use this field to specify a Sequence Ontology type
- for the landmark sequences in the GFF fie (e.g. 'chromosome'). This is only needed if
- the landmark features (first column of the GFF3 file) are not already in the database.."),
- ];
- $form['proteins'] = [
- '#type' => 'fieldset',
- '#title' => t('Proteins'),
- '#collapsible' => TRUE,
- '#collapsed' => FALSE,
- ];
- $form['proteins']['skip_protein'] = [
- '#type' => 'checkbox',
- '#title' => t('Skip automatic protein creation'),
- '#required' => FALSE,
- '#description' => t('The GFF loader will automatically create a protein feature for each transcript in the GFF file if a protein feature is missing in the GFF file. Check this box to disable this functionality. Protein features that are specifically present in the GFF will always be created.'),
- '#default_value' => 0,
- ];
- $form['proteins']['re_mrna'] = [
- '#type' => 'textfield',
- '#title' => t('Optional. Regular expression for the mRNA name'),
- '#required' => FALSE,
- '#description' => t('If automatic protein creation is enabled, then by default the loader will give each protein a name based on the name of the corresponding mRNA followed by the "-protein" suffix.
- If you want to customize the name of the created protein, you can enter a regular expression that will extract portions of
- the mRNA unique name. For example, for a
- mRNA with a unique name finishing by -RX (e.g. SPECIES0000001-RA),
- the regular expression would be, "^(.*?)-R([A-Z]+)$". Elements surrounded by parentheses are captured as backreferences and can be used for replacement.' ),
- ];
- $form['proteins']['re_protein'] = [
- '#type' => 'textfield',
- '#title' => t('Optional. Replacement string for the protein name'),
- '#required' => FALSE,
- '#description' => t('If a regular expression is used to specify a protein name you can use the backreference tokens to extract the portion of the mRNA name that you want to use for a protein.
- You use a dollar sign followed by a number to indicate the backreferences. For example: "$1-P$2".'),
- ];
- $form['targets'] = [
- '#type' => 'fieldset',
- '#title' => t('Targets'),
- '#collapsible' => TRUE,
- '#collapsed' => FALSE,
- ];
- $form['targets']['adesc'] = [
- '#markup' => t("When alignments are represented in the GFF file (e.g. such as
- alignments of cDNA sequences to a whole genome, or blast matches), they are
- represented using two feature types: 'match' (or cDNA_match, EST_match, etc.)
- and 'match_part'. These features may also have a 'Target' attribute to
- specify the sequence that is being aligned.
- However, the organism to which the aligned sequence belongs may not be present in the
- GFF file. Here you can specify the organism and feature type of the target sequences.
- The options here will apply to all targets unless the organism and type are explicity
- set in the GFF file using the 'target_organism' and 'target_type' attributes."),
- ];
- $form['targets']['target_organism_id'] = [
- '#title' => t('Target Organism'),
- '#type' => t('select'),
- '#description' => t("Optional. Choose the organism to which target sequences belong.
- Select this only if target sequences belong to a different organism than the
- one specified above. And only choose an organism here if all of the target sequences
- belong to the same species. If the targets in the GFF file belong to multiple
- different species then the organism must be specified using the 'target_organism=genus:species'
- attribute in the GFF file."),
- '#options' => $organisms,
- ];
- $form['targets']['target_type'] = [
- '#title' => t('Target Type'),
- '#type' => t('textfield'),
- '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
- and an mRNA have the same name) then you must specify the type for all targets in the GFF file. If
- the targets are of different types then the type must be specified using the 'target_type=type' attribute
- in the GFF file. This must be a valid Sequence Ontology (SO) term."),
- ];
- $form['targets']['create_target'] = [
- '#type' => 'checkbox',
- '#title' => t('Create Target'),
- '#required' => FALSE,
- '#description' => t("If the target feature cannot be found, create one using the organism and type specified above, or
- using the 'target_organism' and 'target_type' fields specified in the GFF file. Values specified in the
- GFF file take precedence over those specified above."),
- ];
- // Advanced Options
- $form['advanced'] = [
- '#type' => 'fieldset',
- '#title' => t('Additional Options'),
- '#collapsible' => TRUE,
- '#collapsed' => FALSE,
- ];
- $form['advanced']['line_number'] = [
- '#type' => 'textfield',
- '#title' => t('Start Line Number'),
- '#description' => t('Enter the line number in the GFF file where you would like to begin processing. The
- first line is line number 1. This option is useful for examining loading problems with large GFF files.'),
- '#size' => 10,
- ];
- $form['advanced']['alt_id_attr'] = [
- '#title' => t('ID Attribute'),
- '#type' => t('textfield'),
- '#description' => t("Optional. Sometimes lines in the GFF file are missing the
- required ID attribute that specifies the unique name of the feature, but there
- may be another attribute that can uniquely identify the feature. If so,
- you may specify the name of the attribute to use for the name."),
- ];
- return $form;
- }
/**
 * Validates the GFF3-specific values submitted with the importer form.
 *
 * Checks that conflicting import modes are not both selected, that the
 * start line (when given) is a positive integer, that the mRNA regular
 * expression and the protein replacement string are supplied together, and
 * that the regular expression compiles. Problems are reported with
 * form_set_error().
 *
 * @param array $form
 *   The form array (unused).
 * @param array $form_state
 *   The form state; only $form_state['values'] is read.
 *
 * @see TripalImporter::formValidate()
 */
public function formValidate($form, &$form_state) {
  $values = $form_state['values'];

  // 'add_only' and 'update' may be absent from the submitted values, so they
  // are read defensively instead of triggering undefined index notices.
  $add_only = !empty($values['add_only']);
  $update = !empty($values['update']);
  $line_number = trim($values['line_number']);
  $re_mrna = trim($values['re_mrna']);
  $re_protein = trim($values['re_protein']);

  // The 'refresh' and 'remove' modes are disabled (they were hard-coded to
  // 0), so the only possible conflict is between 'add_only' and 'update'.
  if ($add_only && $update) {
    form_set_error('add_only', t("Please select only one checkbox from the import options section"));
  }

  // An empty value means "start at the beginning". Anything else must be a
  // whole number of at least 1, as the error message states.
  if ($line_number !== '' && (!preg_match('/^\d+$/', $line_number) || (int) $line_number < 1)) {
    form_set_error('line_number', t("Please provide an integer line number greater than zero."));
  }

  // The regular expression and the replacement only make sense as a pair.
  // Flag whichever of the two fields was left empty.
  if (($re_mrna === '') !== ($re_protein === '')) {
    $missing_field = ($re_mrna === '') ? 're_mrna' : 're_protein';
    form_set_error($missing_field, t("You must provide both a regular expression for mRNA and a replacement string for protein"));
  }

  // Check the regular expression to make sure it is valid. The PCRE
  // functions report a bad pattern with an E_WARNING, which is silenced here
  // so that the failure can be reported as a form error instead.
  if ($re_mrna !== '') {
    $pattern = "/" . $re_mrna . "/";
    set_error_handler(function () {
    }, E_WARNING);
    $match_result = preg_match($pattern, '');
    $replace_result = preg_replace($pattern, $re_protein, '');
    restore_error_handler();

    // preg_match() returns FALSE on failure, preg_replace() returns NULL.
    if ($match_result === FALSE) {
      form_set_error('re_mrna', t('Invalid regular expression.'));
    }
    elseif ($replace_result === NULL) {
      form_set_error('re_protein', t('Invalid replacement string.'));
    }
  }
}
/**
 * Runs the GFF3 import.
 *
 * Copies the loader arguments onto this object, resolves and opens the GFF3
 * file, looks up the vocabulary, organism, analysis and optional landmark
 * and target type records, opens the cache file and then executes the
 * numbered import steps in order.
 *
 * @throws Exception
 *   If the GFF3 file cannot be found or opened, if a required record cannot
 *   be found, or if an import step throws. In the last case the cache file
 *   is closed before the exception is rethrown.
 *
 * @see TripalImporter::run()
 */
public function run() {
  $arguments = $this->arguments['run_args'];
  $this->gff_file = $this->arguments['files'][0]['file_path'];

  // Set the private member variables of this class using the loader inputs.
  $this->organism_id = $arguments['organism_id'];
  $this->analysis_id = $arguments['analysis_id'];
  $this->add_only = $arguments['add_only'];
  $this->update = $arguments['update'];
  $this->target_organism_id = $arguments['target_organism_id'];
  $this->target_type = $arguments['target_type'];
  $this->create_target = $arguments['create_target'];
  $this->start_line = $arguments['line_number'];
  $this->landmark_type = $arguments['landmark_type'];
  $this->alt_id_attr = $arguments['alt_id_attr'];
  $this->create_organism = $arguments['create_organism'];
  $this->re_mrna = $arguments['re_mrna'];
  $this->re_protein = $arguments['re_protein'];
  $this->skip_protein = $arguments['skip_protein'];

  // Check to see if the file is located local to Drupal. If so, use the
  // path under the document root.
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $this->gff_file;
  if (file_exists($dfile)) {
    $this->gff_file = $dfile;
  }
  // If the file is not local to Drupal check if it exists on the system.
  elseif (!file_exists($this->gff_file)) {
    throw new Exception(t("Cannot find the file: !file", ['!file' => $this->gff_file]));
  }

  // Open the GFF3 file.
  $this->logMessage("Opening !gff_file", ['!gff_file' => $this->gff_file]);
  $this->gff_file_h = fopen($this->gff_file, 'r');
  if (!$this->gff_file_h) {
    throw new Exception(t("Cannot open file: !file", ['!file' => $this->gff_file]));
  }

  // Get the feature property CV object
  $this->feature_prop_cv = new ChadoRecord('cv');
  $this->feature_prop_cv->setValues(['name' => 'feature_property']);
  $num_found = $this->feature_prop_cv->find();
  if ($num_found == 0) {
    throw new Exception(t("Cannot find the 'feature_property' ontology", []));
  }

  // Get the sequence CV object.
  $this->feature_cv = new ChadoRecord('cv');
  $this->feature_cv->setValues(['name' => 'sequence']);
  $num_found = $this->feature_cv->find();
  if ($num_found == 0) {
    throw new Exception(t("Cannot find the 'sequence' ontology", []));
  }

  // Get the organism object.
  $this->organism = new ChadoRecord('organism');
  $this->organism->setValues(['organism_id' => $this->organism_id]);
  $num_found = $this->organism->find();
  if ($num_found == 0) {
    throw new Exception(t("Cannot find the specified organism for this GFF3 file."));
  }

  // Get the analysis object.
  $this->analysis = new ChadoRecord('analysis');
  $this->analysis->setValues(['analysis_id' => $this->analysis_id]);
  $num_found = $this->analysis->find();
  if ($num_found == 0) {
    throw new Exception(t("Cannot find the specified analysis for this GFF3 file."));
  }

  // If a landmark type was provided then get that object.
  if ($this->landmark_type) {
    $this->landmark_cvterm = new ChadoRecord('cvterm');
    $this->landmark_cvterm->setValues([
      'cv_id' => $this->feature_cv->getValue('cv_id'),
      'name' => $this->landmark_type,
    ]);
    $num_found = $this->landmark_cvterm->find();
    if ($num_found == 0) {
      throw new Exception(t('Cannot find landmark feature type \'%landmark_type\'.', ['%landmark_type' => $this->landmark_type]));
    }
  }

  // If a target type is provided then get the ID.
  if ($this->target_type) {
    $target_type = new ChadoRecord('cvterm');
    $target_type->setValues([
      'name' => $this->target_type,
      'cv_id' => $this->feature_cv->getID()
    ]);
    $num_found = $target_type->find();
    if ($num_found == 0) {
      throw new Exception(t("Cannot find the specified target type, !type.", ['!type' => $this->target_type]));
    }
    $this->target_type_id = $target_type->getID();
  }

  // Create the cache file for storing parsed GFF entries.
  $this->openCacheFile();

  // Load the GFF3.
  try {
    $this->logMessage("Step  1: Caching GFF3 file...                  ");
    $this->parseGFF3();

    // Prep the database for necessary records.
    $this->prepSynonms();
    $this->prepNullPub();
    $this->prepDBs();

    $this->logMessage("Step  2: Insert new landmarks sequences...     ");
    $this->findLandmarks();
    $this->insertLandmarks();

    $this->logMessage("Step  3: Find existing features...             ");
    $this->findFeatures();

    $this->logMessage("Step  4: Prepare for any updates ...           ");
    $this->deleteFeatureData();

    $this->logMessage("Step  5: Processing !num_features features...  ",
      ['!num_features' => count(array_keys($this->features))]);
    $this->insertFeatures();

    // The lookup is repeated so the newly inserted features get their IDs.
    $this->logMessage("Step  6: Get new feature IDs...                ");
    $this->findFeatures();

    $this->logMessage("Step  7: Insert locations...                   ");
    $this->insertFeatureLocs();

    $this->logMessage("Step  8: Insert properties...                  ");
    $this->insertFeatureProps();

    $this->logMessage("Step  9: Find synonyms (aliases)...            ");
    $this->findSynonyms();

    $this->logMessage("Step 10: Insert new synonyms (aliases)...      ");
    $this->insertSynonyms();

    $this->logMessage("Step 11: Insert feature synonyms (aliases)...  ");
    $this->insertFeatureSynonyms();

    $this->logMessage("Step 12: Find cross references...              ");
    $this->findDbxrefs();

    $this->logMessage("Step 13: Insert new cross references...        ");
    $this->insertDbxrefs();

    $this->logMessage("Step 14: Get new cross references IDs...       ");
    $this->findDbxrefs();

    $this->logMessage("Step 15: Insert feature cross references...    ");
    $this->insertFeatureDbxrefs();

    $this->logMessage("Step 16: Insert feature ontology terms...      ");
    $this->insertFeatureCVterms();

    $this->logMessage("Step 17: Add child-parent relationships...     ");
    $this->findChildRanks();
    $this->insertFeatureParents();

    $this->logMessage("Step 18: Insert 'derives_from' relationships... ");
    $this->insertFeatureDerivesFrom();

    $this->logMessage("Step 19: Insert Targets...                     ");
    $this->insertFeatureTargets();

    $this->logMessage("Step 20: Associate features with analysis....  ");
    $this->insertFeatureAnalysis();

    if (!empty($this->residue_index)) {
      $this->logMessage("Step 21: Adding sequences if available...      ");
      // NOTE(review): the sequence insertion call is disabled here, so this
      // step only logs a message. Confirm whether it should be re-enabled.
      //$this->insertFeatureSeqs();
    }
  }
  // On exception, catch the error, clean up the cache file and rethrow
  catch (Exception $e) {
    $this->closeCacheFile();
    throw $e;
  }
}
/**
 * Retrieves the cvterm_id for a feature type or feature property type.
 *
 * The in-memory lookup arrays are consulted first so that each term is only
 * queried once. If the term is not cached it is searched for, by name or
 * synonym (case-insensitive), in the matching vocabulary. If it still cannot
 * be found it is inserted as a term of the 'local' database.
 *
 * @param $type
 *   The name (or synonym) of the term.
 * @param $is_prop_type
 *   TRUE to use the feature property vocabulary and lookup array, FALSE to
 *   use the feature type vocabulary and lookup array.
 *
 * @return
 *   The cvterm_id of the term.
 *
 * @throws Exception
 *   If the term is missing and cannot be inserted.
 *
 * @ingroup gff3_loader
 */
private function getTypeID($type, $is_prop_type) {
  $lookup_key = strtolower($type);

  // Pick the vocabulary and check the matching cache first.
  if ($is_prop_type) {
    $cv = $this->feature_prop_cv;
    if (array_key_exists($lookup_key, $this->featureprop_cvterm_lookup)) {
      return $this->featureprop_cvterm_lookup[$lookup_key];
    }
  }
  else {
    $cv = $this->feature_cv;
    if (array_key_exists($lookup_key, $this->feature_cvterm_lookup)) {
      return $this->feature_cvterm_lookup[$lookup_key];
    }
  }

  // The name is selected as well so that the term can also be cached under
  // its canonical name when it was matched through a synonym.
  $sel_cvterm_sql = "
    SELECT CVT.cvterm_id, CVT.name
    FROM {cvterm} CVT
      LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
    WHERE CVT.cv_id = :cv_id and
     (lower(CVT.name) = lower(:name) or lower(CVTS.synonym) = lower(:synonym))
  ";
  $result = chado_query($sel_cvterm_sql, [
    ':cv_id' => $cv->getValue('cv_id'),
    ':name' => $type,
    ':synonym' => $type,
  ]);
  $found = $result->fetchObject();
  $cvterm_id = $found ? $found->cvterm_id : NULL;
  $cvterm_name = $found ? $found->name : $type;

  // If the term couldn't be found then insert it as a local term.
  if (!$cvterm_id) {
    $term = [
      'id' => "local:$type",
      'name' => $type,
      'is_obsolete' => 0,
      'cv_name' => $cv->getValue('name'),
      'db_name' => 'local',
      'is_relationship' => FALSE,
    ];
    $cvterm = chado_insert_cvterm($term, ['update_existing' => FALSE]);
    if (!$cvterm) {
      throw new Exception(t('Cannot find or add the term, !type.', ['!type' => $type]));
    }
    $cvterm = (object) $cvterm;
    $cvterm_id = $cvterm->cvterm_id;
    $cvterm_name = isset($cvterm->name) ? $cvterm->name : $type;
  }

  // Remember the term under both its canonical name and the requested name.
  if ($is_prop_type) {
    $this->featureprop_cvterm_lookup[strtolower($cvterm_name)] = $cvterm_id;
    $this->featureprop_cvterm_lookup[$lookup_key] = $cvterm_id;
  }
  else {
    $this->feature_cvterm_lookup[strtolower($cvterm_name)] = $cvterm_id;
    $this->feature_cvterm_lookup[$lookup_key] = $cvterm_id;
  }
  return $cvterm_id;
}
/**
 * Makes sure Chado is ready with the necessary synonym type records.
 *
 * Ensures the 'synonym_type' vocabulary and its 'exact' term exist, adding
 * them when missing, and stores the 'exact' term in $this->exact_syn.
 *
 * @return
 *   0 if a required record could not be added. Nothing is returned on
 *   success.
 */
private function prepSynonms() {
  // Make sure we have a 'synonym_type' vocabulary.
  $select = ['name' => 'synonym_type'];
  $results = chado_select_record('cv', ['*'], $select);
  if (empty($results)) {
    // Insert the 'synonym_type' vocabulary.
    $values = [
      'name' => 'synonym_type',
      'definition' => 'vocabulary for synonym types',
    ];
    $success = chado_insert_record('cv', $values, [
      'skip_validation' => TRUE,
    ]);
    if (!$success) {
      $this->logMessage("Failed to add the synonyms type vocabulary.", [], TRIPAL_WARNING);
      return 0;
    }
    // Now that we've added the cv we need to get the record.
    $results = chado_select_record('cv', ['*'], $select);
    if (empty($results)) {
      $this->logMessage("Failed to add the synonyms type vocabulary.", [], TRIPAL_WARNING);
      return 0;
    }
  }
  $syncv = $results[0];

  // Get the 'exact' cvterm, which is the type of synonym we're adding.
  $select = [
    'name' => 'exact',
    'cv_id' => [
      'name' => 'synonym_type',
    ],
  ];
  $result = chado_select_record('cvterm', ['*'], $select);
  if (empty($result)) {
    $term = [
      'name' => 'exact',
      'id' => "synonym_type:exact",
      'definition' => '',
      'is_obsolete' => 0,
      'cv_name' => $syncv->name,
      'is_relationship' => FALSE,
    ];
    $syntype = chado_insert_cvterm($term, ['update_existing' => TRUE]);
    if (!$syntype) {
      $this->logMessage("Cannot add synonym type: synonym_type:exact.", [], TRIPAL_WARNING);
      return 0;
    }
  }
  else {
    $syntype = $result[0];
  }
  $this->exact_syn = $syntype;
}
/**
 * Makes sure there is a null publication in the database.
 *
 * Looks for a pub record with the uniquename 'null', inserts one if it is
 * missing and stores the record in $this->null_pub.
 *
 * @return
 *   0 if the null publication could not be added. Nothing is returned on
 *   success.
 */
private function prepNullPub() {
  // Check to see if we have a NULL publication in the pub table. If not,
  // then add one.
  $select = ['uniquename' => 'null'];
  $result = chado_select_record('pub', ['*'], $select);
  if (empty($result)) {
    $pub_sql = "
      INSERT INTO {pub} (uniquename,type_id)
      VALUES (:uname,
        (SELECT cvterm_id
         FROM {cvterm} CVT
           INNER JOIN {dbxref} DBX ON DBX.dbxref_id = CVT.dbxref_id
           INNER JOIN {db} DB ON DB.db_id = DBX.db_id
         WHERE CVT.name = :type_id))
    ";
    // Insert the null pub. An INSERT returns no rows, so success is
    // confirmed by selecting the record back rather than by fetching from
    // the statement.
    $status = chado_query($pub_sql, [
      ':uname' => 'null',
      ':type_id' => 'null',
    ]);
    if ($status) {
      $result = chado_select_record('pub', ['*'], $select);
    }
    if (!$status or empty($result)) {
      $this->logMessage("Cannot add null publication needed for setup of alias.", [], TRIPAL_WARNING);
      return 0;
    }
  }
  $this->null_pub = $result[0];
}
/**
 * Makes sure Chado is ready with the necessary DB records.
 *
 * For every database name collected in $this->db_lookup the db_id is looked
 * up (first as "DB:<dbname>", then as "<dbname>"). Missing databases are
 * added. The db_id replaces the placeholder value in $this->db_lookup.
 *
 * @return
 *   0 if a database could be neither found nor added. Nothing is returned
 *   on success.
 */
private function prepDBs() {
  foreach (array_keys($this->db_lookup) as $dbname) {
    // First check for the fully qualified URI (e.g. DB:<dbname>). If that
    // can't be found then look for the name as is. If it still can't be
    // found then create the database.
    $db = chado_select_record('db', ['db_id'], ['name' => "DB:$dbname"]);
    if (empty($db)) {
      $db = chado_select_record('db', ['db_id'], ['name' => "$dbname"]);
    }
    if (empty($db)) {
      $values = [
        'name' => $dbname,
        'description' => 'Added automatically by the Triapl GFF loader.',
      ];
      $success = chado_insert_record('db', $values, [
        'skip_validation' => TRUE,
      ]);
      if ($success) {
        $db = chado_select_record('db', ['db_id'], ['name' => "$dbname"]);
      }
      // Also bail out if the record cannot be read back after the insert.
      if (!$success or empty($db)) {
        $this->logMessage("Cannot find or add the database $dbname.", [], TRIPAL_WARNING);
        return 0;
      }
    }
    $this->db_lookup[$dbname] = $db[0]->db_id;
  }
}
/**
 * Parses a single feature line of the GFF3 file.
 *
 * @param $line
 *   A feature line of the GFF3 file. It must have 9 tab-delimited columns.
 *
 * @return array
 *   An associative array containing the columns of the GFF3 line ('line',
 *   'landmark', 'source', 'type', 'start', 'stop', 'score', 'strand',
 *   'phase') where 'start' is converted to a zero-based position, 'attrs'
 *   (all attributes of the 9th column keyed by tag name) and the values
 *   derived from the attributes: 'skipped', 'name', 'uniquename',
 *   'synonyms', 'dbxrefs', 'terms', 'derives_from', 'organism', 'target',
 *   'properties' and 'parent'.
 *
 * @throws Exception
 *   If the line does not have 9 columns, an attribute is malformed, or an
 *   attribute that may only have one value has several.
 */
private function parseFeature($line) {
  // Get the columns.
  $cols = explode("\t", $line);
  if (sizeof($cols) != 9) {
    throw new Exception(t('Improper number of columns on line %line_num: %line', ['%line_num' => $this->current_line, '%line' => $line]));
  }
  $ret = [
    'line' => $this->current_line,
    'landmark' => $cols[0],
    'source' => $cols[1],
    'type' => strtolower($cols[2]),
    'start' => $cols[3],
    'stop' => $cols[4],
    'score' => $cols[5],
    'strand' => $cols[6],
    'phase' => $cols[7],
    'attrs' => [],
  ];

  // Ready the start and stop for chado. Chado expects these positions
  // to be zero-based, so we subtract 1 from the fmin. Also, in case
  // they are backwards, put them in the right order.
  $fmin = $ret['start'] - 1;
  $fmax = $ret['stop'];
  if ($ret['stop'] < $ret['start']) {
    $fmin = $ret['stop'] - 1;
    $fmax = $ret['start'];
  }
  $ret['start'] = $fmin;
  $ret['stop'] = $fmax;

  // Format the strand for chado. Unrecognized values are left untouched.
  $strands = ['.' => 0, '+' => 1, '-' => -1];
  if (array_key_exists($ret['strand'], $strands)) {
    $ret['strand'] = $strands[$ret['strand']];
  }

  // A missing phase defaults to 0 for CDS features and is empty otherwise.
  if ($ret['phase'] === '.') {
    $ret['phase'] = ($ret['type'] == 'cds') ? '0' : '';
  }

  $tags = [];
  $attrs = explode(";", $cols[8]);
  $attr_organism = $this->organism_id;
  $attr_parent = '';
  $attr_others = [];
  $attr_aliases = [];
  $attr_dbxref = [];
  $attr_derives = [];
  $attr_terms = [];
  $attr_target = [];
  foreach ($attrs as $attr) {
    $attr = trim($attr);
    if ($attr === '') {
      continue;
    }
    if (!preg_match('/^[^\=]+\=.+$/', $attr)) {
      throw new Exception(t('Attribute is not correctly formatted on line !line_num: !attr',
        ['!line_num' => $this->current_line, '!attr' => $attr]));
    }

    // Break apart each attribute into key/value pairs.
    $tag = preg_split("/=/", $attr, 2);
    $tag_name = $tag[0];

    // Multiple values of an attribute are separated by commas. Only the
    // values of this occurrence are decoded so that a tag which is repeated
    // on the line is neither decoded twice nor added twice to the lists
    // below. rawurldecode() is used because GFF3 percent-encodes reserved
    // characters but a literal '+' (e.g. the strand of a Target) must be
    // kept as is.
    $values = array_map('rawurldecode', explode(",", $tag[1]));
    if (!array_key_exists($tag_name, $tags)) {
      $tags[$tag_name] = [];
    }
    $tags[$tag_name] = array_merge($tags[$tag_name], $values);

    // The final name of the feature is not known until all attributes are
    // parsed, so error messages use the ID (if seen already) or the line.
    $feature_label = !empty($tags['ID']) ? $tags['ID'][0] : 'line ' . $this->current_line;

    if ($tag_name === 'Alias') {
      $attr_aliases = array_merge($attr_aliases, $values);
    }
    elseif ($tag_name === 'Parent') {
      $attr_parent = rawurldecode($tag[1]);
    }
    elseif ($tag_name === 'Dbxref') {
      $attr_dbxref = array_merge($attr_dbxref, $values);
    }
    elseif ($tag_name === 'Derives_from') {
      $attr_derives = array_merge($attr_derives, $values);
    }
    elseif ($tag_name === 'Ontology_term') {
      $attr_terms = array_merge($attr_terms, $values);
    }
    elseif ($tag_name === 'organism') {
      if (count($tags[$tag_name]) > 1) {
        throw new Exception(t('Each feature can only have one "organism" attribute. The feature %uniquename has more than one: %organism',
          ['%uniquename' => $feature_label, '%organism' => implode(', ', $tags[$tag_name])]));
      }
      $attr_organism = $this->findOrganism($tags[$tag_name][0], $this->current_line);
    }
    elseif ($tag_name === 'Target') {
      $matches = [];
      if (count($tags[$tag_name]) > 1) {
        throw new Exception(t('Each feature can only have one "Target" attribute. The feature %uniquename has more than one.',
          ['%uniquename' => $feature_label]));
      }
      if (preg_match('/^(.*?)\s+(\d+)\s+(\d+)(\s+[\+|\-])*$/', trim($tags[$tag_name][0]), $matches)) {
        $attr_target['name'] = $matches[1];

        // Same zero-based conversion and ordering as for the feature.
        $tfmin = $matches[2] - 1;
        $tfmax = $matches[3];
        if ($matches[3] < $matches[2]) {
          $tfmin = $matches[3] - 1;
          $tfmax = $matches[2];
        }
        $attr_target['start'] = $tfmin;
        $attr_target['stop'] = $tfmax;
        $attr_target['phase'] = '';
        $attr_target['strand'] = 0;
        if (!empty($matches[4])) {
          $target_strand = trim($matches[4]);
          if ($target_strand === '+') {
            $attr_target['strand'] = 1;
          }
          elseif ($target_strand === '-') {
            $attr_target['strand'] = -1;
          }
        }
        // Do not overwrite an organism set by a 'target_organism' attribute
        // that appeared earlier on the line.
        if (!isset($attr_target['organism_id'])) {
          $attr_target['organism_id'] = $this->target_organism_id ? $this->target_organism_id : $this->organism_id;
        }
        $attr_target['type_id'] = $this->target_type_id ? $this->target_type_id : NULL;
      }
    }
    elseif ($tag_name === 'target_organism') {
      $attr_target['organism_id'] = $this->findOrganism($tags[$tag_name][0], $this->current_line);
    }
    elseif ($tag_name === 'target_type') {
      $attr_target['type'] = $tags[$tag_name][0];
    }
    // All remaining attributes, except for the reserved 'Name' and 'ID',
    // get added as properties to the featureprop table. This includes the
    // 'Note', 'Gap' and 'Is_Circular' attributes. The other reserved
    // attributes were handled by the branches above.
    elseif ($tag_name !== 'Name' and $tag_name !== 'ID') {
      if (!array_key_exists($tag_name, $attr_others)) {
        $attr_others[$tag_name] = [];
      }
      foreach ($values as $value) {
        $attr_others[$tag_name][] = $value;
      }
    }
  }

  // A feature may get ignored. But let's default this to FALSE.
  $ret['skipped'] = FALSE;

  // If neither name nor uniquename are provided then generate one.
  $names = $this->getFeatureName($tags, $ret['type'], $ret['landmark'], $fmin, $fmax);
  $ret['name'] = $names['name'];
  $ret['uniquename'] = $names['uniquename'];
  $ret['synonyms'] = $attr_aliases;

  // Add in the dbxref records, keyed by "<db>:<accession>".
  $ret['dbxrefs'] = [];
  foreach ($attr_dbxref as $dbx) {
    $parts = explode(':', $dbx, 2);
    $accession = isset($parts[1]) ? $parts[1] : NULL;
    $ret['dbxrefs']["{$parts[0]}:{$accession}"] = [
      'db' => $parts[0],
      'accession' => $accession,
    ];
  }

  // Add in the GFF source dbxref. This is needed for GBrowse.
  $ret['dbxrefs']["GFF_source:{$ret['source']}"] = [
    'db' => 'GFF_source',
    'accession' => $ret['source'],
  ];

  // Add in the ontology terms.
  $ret['terms'] = [];
  foreach ($attr_terms as $dbx) {
    $parts = explode(':', $dbx, 2);
    $accession = isset($parts[1]) ? $parts[1] : NULL;
    $ret['terms']["{$parts[0]}:{$accession}"] = [
      'db' => $parts[0],
      'accession' => $accession,
    ];
  }

  // Add the derives from entry.
  if (count($attr_derives) > 1) {
    throw new Exception(t('Each feature can only have one "Derives_from" attribute. The feature %uniquename has more than one: %derives',
      [
        '%uniquename' => $ret['uniquename'],
        '%derives' => implode(', ', $attr_derives),
      ]));
  }
  $ret['derives_from'] = (count($attr_derives) == 1) ? $attr_derives[0] : '';

  // Now add all of the attributes into the return array.
  $ret['attrs'] = $tags;

  // Add the organism entry. Features without an organism are skipped.
  $ret['organism'] = $attr_organism;
  if (!$ret['organism']) {
    $ret['skipped'] = TRUE;
  }

  // Add the target. If the type is missing then remove it and we'll
  // skip it.
  // NOTE(review): 'type' is only set by the 'target_type' attribute. The
  // importer-level target type only sets 'type_id', so targets of lines
  // without a 'target_type' attribute are dropped here. Confirm this is
  // intended.
  $ret['target'] = empty($attr_target['type']) ? [] : $attr_target;

  // Add the properties and parent.
  $ret['properties'] = $attr_others;
  $ret['parent'] = $attr_parent;
  return $ret;
}
/**
 * Indexes the FASTA section of the file for quick lookup.
 *
 * Reads the rest of the GFF3 file and, for each FASTA header line, stores
 * the file position that follows the header in $this->residue_index, keyed
 * by the sequence ID.
 */
private function indexFASTA() {
  while (TRUE) {
    $row = fgets($this->gff_file_h);
    if (!$row) {
      break;
    }
    $this->current_line++;
    $this->addItemsHandled(drupal_strlen($row));

    // Only definition lines are of interest.
    if (!preg_match('/^>/', $row)) {
      continue;
    }

    // The ID is the first word after the '>'. The file pointer now sits at
    // the start of the residues for this ID.
    $seq_id = trim(preg_replace('/^>([^\s]+).*$/', '\1', $row));
    $this->residue_index[$seq_id] = ftell($this->gff_file_h);
  }
}
/**
 * Loads the actual residue information from the FASTA section of the file.
 *
 * For each sequence in $this->residue_index the residues are read from the
 * GFF3 file and the residues, seqlen and md5checksum of the matching feature
 * or landmark record are updated. Sequences without a matching feature or
 * landmark are reported with a warning and skipped.
 */
private function insertFeatureSeqs() {
  $this->setItemsHandled(0);
  $this->setTotalItems(count($this->residue_index));
  $count = 0;
  foreach ($this->residue_index as $uniquename => $offset) {
    $is_feature = !empty($this->features[$uniquename]);
    $is_landmark = !empty($this->landmarks[$uniquename]);
    if (!$is_feature and !$is_landmark) {
      $this->logMessage('Cannot find feature to assign FASTA sequence: %uname.',
        ['%uname' => $uniquename], TRIPAL_WARNING);
      $count++;
      continue;
    }
    $feature = $is_feature ? $this->features[$uniquename] : $this->landmarks[$uniquename];

    // NOTE(review): ensureFeatureIsLoaded() is not defined in the reviewed
    // part of the file. Confirm it accepts the values stored in the lookup
    // arrays.
    $this->ensureFeatureIsLoaded($feature);

    // Entries of the feature list are arrays with a 'feature_id' key while
    // landmark entries hold either the feature ID itself or a ChadoRecord.
    if (is_array($feature)) {
      $id = $feature['feature_id'];
    }
    elseif (is_object($feature)) {
      $id = $feature->getID();
    }
    else {
      $id = $feature;
    }

    // Read the residues up to the next definition line or the end of file.
    $residues = [];
    fseek($this->gff_file_h, $offset);
    while (($line = fgets($this->gff_file_h)) !== FALSE) {
      if (preg_match('/^>/', $line)) {
        break;
      }
      $residues[] = trim($line);
    }
    $residues = implode('', $residues);

    // The lookup arrays are intentionally left untouched. Replacing an
    // entry of the feature list with a bare ID would remove the 'findex'
    // value that the other steps of the importer read.
    chado_update_record('feature', ['feature_id' => $id], [
      'residues' => $residues,
      'seqlen' => strlen($residues),
      'md5checksum' => md5($residues),
    ]);
    $count++;
    $this->setItemsHandled($count);
  }
}
/**
 * Retrieves the landmark feature with the given name.
 *
 * The landmark lookup list is checked first. Otherwise the feature table is
 * searched by organism and uniquename, and by the landmark type if one was
 * provided to the importer.
 *
 * @param $landmark_name
 *   The name of the landmark to get.
 *
 * @return
 *   The entry of the landmark lookup list if the name is already in it,
 *   a feature ChadoRecord object if the landmark was found in the database,
 *   or NULL if it could not be found.
 *
 * @throws Exception
 *   If more than one feature matches the landmark.
 */
private function findLandmark($landmark_name) {
  // Before performing a database query check to see if
  // this landmark is already in our lookup list.
  if (array_key_exists($landmark_name, $this->landmarks)) {
    return $this->landmarks[$landmark_name];
  }

  $landmark = new ChadoRecord('feature');
  $landmark->setValues([
    'organism_id' => $this->organism_id,
    'uniquename' => $landmark_name,
  ]);
  // Narrow the search by the landmark type when one was provided.
  if ($this->landmark_type and !empty($this->landmark_cvterm)) {
    $landmark->setValue('type_id', $this->landmark_cvterm->getValue('cvterm_id'));
  }

  $num_found = $landmark->find();
  if ($num_found == 0) {
    return NULL;
  }
  if ($num_found > 1) {
    throw new Exception(t("The landmark '%landmark' has more than one entry for this organism (%species). Did you provide a landmark type? If not, try resubmitting and providing a type. " .
      "Cannot continue", [
        '%landmark' => $landmark_name,
        '%species' => $this->organism->getValue('genus') . " " . $this->organism->getValue('species'),
      ]));
  }

  // The landmark was found, remember it.
  $this->landmarks[$landmark_name] = $landmark;
  return $landmark;
}
/**
 * Loads into the database the landmark of a ##sequence-region header.
 *
 * Nothing happens if the line is not a ##sequence-region header or if the
 * landmark can already be found.
 *
 * @param $line
 *   The line from the GFF file that is the ##sequence-region comment.
 *
 * @throws Exception
 *   If the landmark must be added but no landmark type was provided.
 */
private function insertHeaderLandmark($line) {
  $region_matches = [];
  if (!preg_match('/^##sequence-region\s+(\w*?)\s+(\d+)\s+(\d+)$/i', $line, $region_matches)) {
    return;
  }

  // Only the ID of the region is needed. The start and end are not stored.
  $rid = $region_matches[1];
  if ($this->findLandmark($rid)) {
    return;
  }
  if (!$this->landmark_type) {
    throw new Exception(t('The landmark, !landmark, cannot be added becuase no landmark type was provided. Please redo the importer job and specify a landmark type.',
      ['!landmark' => $rid]));
  }
  $this->insertLandmark($rid);
}
/**
 * Inserts a landmark feature using the given name.
 *
 * The new feature uses the name for both its name and uniquename, the
 * importer's organism and landmark type, and the checksum of an empty
 * sequence. Its ID is stored in the landmark lookup list.
 *
 * @param $name
 *   The name of the landmark.
 */
private function insertLandmark($name) {
  // The landmark has no residues yet.
  $empty_sequence = '';
  $values = [
    'organism_id' => $this->organism->getValue('organism_id'),
    'uniquename' => $name,
    'name' => $name,
    'type_id' => $this->landmark_cvterm->getValue('cvterm_id'),
    'md5checksum' => md5($empty_sequence),
    'is_analysis' => FALSE,
    'is_obsolete' => FALSE,
  ];

  $record = new ChadoRecord('feature');
  $record->setValues($values);
  $record->insert();
  $this->landmarks[$name] = $record->getID();
}
/**
 * Reads the GFF3 file and caches all of the features it contains.
 *
 * Lines before the start line, comments and empty lines are skipped. The
 * ##sequence-region headers are remembered and the FASTA section, if any,
 * is indexed. Every other line is parsed as a feature. While parsing, the
 * landmarks, databases, cross references and terms that the features use are
 * collected in the lookup arrays. Afterwards the type IDs for all feature
 * and property types are retrieved and missing protein features are added.
 */
private function parseGFF3() {
  $this->setTotalItems(filesize($this->gff_file));

  // Hold the unique feature types and property types found in the file.
  $type_counts = [];
  $prop_type_counts = [];

  while ($raw = fgets($this->gff_file_h)) {
    $this->current_line++;
    $this->addItemsHandled(drupal_strlen($raw));
    $text = trim($raw);

    if ($this->current_line < $this->start_line) {
      continue;
    }

    // The FASTA section is the last one of the file. Indexing it reads all
    // of the remaining lines.
    if (preg_match('/^##FASTA/i', $text)) {
      $this->indexFASTA();
      continue;
    }

    // Remember the ##sequence-region lines by their ID.
    $region = [];
    if (preg_match('/^##sequence-region\s+(\w*?)\s+(\d+)\s+(\d+)$/i', $text, $region)) {
      $this->seq_region_headers[$region[1]] = $text;
      continue;
    }

    // Skip comments and empty lines.
    if (preg_match('/^#/', $text) or preg_match('/^\s*$/', $text)) {
      continue;
    }

    // Everything else is a feature.
    $parsed = $this->parseFeature($text);
    $has_target = array_key_exists('name', $parsed['target']);

    // Landmarks that are not known yet are flagged with FALSE.
    $landmark_name = $parsed['landmark'];
    if (!array_key_exists($landmark_name, $this->landmarks)) {
      $this->landmarks[$landmark_name] = FALSE;
    }

    // Collect the databases and cross references for faster access later.
    foreach ($parsed['dbxrefs'] as $xref_key => $xref) {
      if (!array_key_exists($xref['db'], $this->db_lookup)) {
        $this->db_lookup[$xref['db']] = FALSE;
      }
      if (!array_key_exists($xref_key, $this->dbxref_lookup)) {
        $this->dbxref_lookup[$xref_key] = $xref;
      }
    }

    // The cross references of the Ontology_term attribute are collected
    // too. Those get an additional 'cvterm_id' key.
    foreach ($parsed['terms'] as $term_key => $term) {
      if (!array_key_exists($term['db'], $this->db_lookup)) {
        $this->db_lookup[$term['db']] = FALSE;
      }
      if (!array_key_exists($term_key, $this->dbxref_lookup)) {
        $this->dbxref_lookup[$term_key] = $term;
        $this->dbxref_lookup[$term_key]['cvterm_id'] = NULL;
      }
    }

    // Count the feature type, and the type of the target if there is one.
    $feature_type = $parsed['type'];
    $type_counts[$feature_type] = array_key_exists($feature_type, $type_counts) ? $type_counts[$feature_type] + 1 : 1;
    if ($has_target) {
      $target_type = $parsed['target']['type'];
      $type_counts[$target_type] = array_key_exists($target_type, $type_counts) ? $type_counts[$target_type] + 1 : 1;
    }

    // Count the property types.
    foreach ($parsed['properties'] as $prop_name => $prop_values) {
      $prop_type_counts[$prop_name] = array_key_exists($prop_name, $prop_type_counts) ? $prop_type_counts[$prop_name] + 1 : 1;
    }

    // Cache the feature unless it describes its own landmark.
    if ($parsed['uniquename'] != $parsed['landmark']) {
      $this->cacheFeature($parsed);
    }

    // A target needs to be added as a feature as well.
    if ($has_target) {
      $this->addTargetFeature($parsed);
    }
  }

  // Make sure we have a protein term in our list.
  if (!(array_key_exists('protein', $type_counts) or array_key_exists('polypeptide', $type_counts))) {
    $type_counts['polypeptide'] = 0;
  }

  // Get the type ID for each feature type and then for each property type.
  foreach ($type_counts as $type_name => $seen) {
    $this->getTypeID($type_name, FALSE);
  }
  foreach ($prop_type_counts as $prop_type_name => $seen) {
    $this->getTypeID($prop_type_name, TRUE);
  }

  // Finally, add any protein features that need to be created.
  $this->addProteinFeatures();
}
/**
 * Checks the features and finds those that need proteins added.
 *
 * A polypeptide feature is added to the cache for every parent that has CDS
 * children but no protein or polypeptide child in the GFF file. Nothing is
 * added if the user chose to skip the creation of proteins.
 */
private function addProteinFeatures() {
  // Don't do anything if the user wants to skip creation of non listed
  // proteins. Proteins that have actual lines in the GFF will still be
  // created.
  if ($this->skip_protein) {
    $this->logMessage(' Skipping creation of non-specified proteins...');
    return;
  }

  // First, store records for which proteins need to exist. These
  // will be for any parent that has a 'CDS' or 'protein' child.
  $proteins = [];
  foreach ($this->features as $info) {
    $findex = $info['findex'];
    $feature = $this->getCachedFeature($findex);
    $type = $feature['type'];
    $parent_name = $feature['parent'];
    if (!$parent_name) {
      continue;
    }
    if ($type == 'cds') {
      if (!array_key_exists($parent_name, $proteins)) {
        $proteins[$parent_name] = [];
      }
      $proteins[$parent_name]['cds'][] = $findex;
    }
    elseif ($type == 'protein' or $type == 'polypeptide') {
      if (!array_key_exists($parent_name, $proteins)) {
        $proteins[$parent_name] = [];
      }
      $proteins[$parent_name]['protein'] = $findex;
    }
  }

  // Second, iterate through the protein list and for any parents that
  // don't already have a protein we need to create one.
  foreach ($proteins as $parent_name => $info) {
    // Skip addition of any proteins that are already in the GFF file and
    // any parents without a CDS.
    if (array_key_exists('protein', $info) or empty($info['cds'])) {
      continue;
    }

    // Find the starting and end CDS.
    $start = INF;
    $stop = -INF;
    $start_phase = 0;
    $stop_phase = 0;
    $cds = NULL;
    foreach ($info['cds'] as $findex) {
      $cds = $this->getCachedFeature($findex);
      if ($cds['start'] < $start) {
        $start = $cds['start'];
        $start_phase = $cds['phase'];
      }
      if ($cds['stop'] > $stop) {
        $stop = $cds['stop'];
        $stop_phase = $cds['phase'];
      }
    }

    // Move the beginning of the protein past the bases given by the phase
    // of its first CDS. On the reverse strand that is the end with the
    // highest coordinate. The phase is cast because it is stored as a
    // string. The strand and the other shared values are taken from the
    // last CDS in the list.
    if ($cds['strand'] == '-1') {
      $stop -= (int) $stop_phase;
    }
    else {
      $start += (int) $start_phase;
    }

    // Get the name for the protein.
    $name = $parent_name;
    if ($this->re_mrna and $this->re_protein) {
      // We use a regex to generate protein name from parent name.
      $uname = preg_replace("/$this->re_mrna/", $this->re_protein, $parent_name);
    }
    else {
      // No regex, use the default '-protein' suffix.
      $uname = $parent_name . '-protein';
    }

    // Now create the protein feature. The organism comes from the
    // 'organism' key, which is the key parsed features carry.
    $feature = [
      'line' => $cds['line'],
      'landmark' => $cds['landmark'],
      'source' => $cds['source'],
      'type' => 'polypeptide',
      'start' => $start,
      'stop' => $stop,
      'strand' => $cds['strand'],
      'phase' => '',
      'attr' => [],
      'skipped' => FALSE,
      'name' => $name,
      'uniquename' => $uname,
      'synonyms' => [],
      'dbxrefs' => [],
      'terms' => [],
      'derives_from' => NULL,
      'organism' => $cds['organism'],
      'target' => [],
      'properties' => [],
      'parent' => $cds['parent'],
    ];
    $this->cacheFeature($feature);
  }
}
/**
 * Caches the target of a feature as a feature of its own.
 *
 * Nothing is added if a feature with the name of the target is already in
 * the feature list.
 *
 * @param $gff_feature
 *   The feature array created by the parseFeature function.
 */
private function addTargetFeature($gff_feature) {
  $target = $gff_feature['target'];
  $target_name = $target['name'];
  if (array_key_exists($target_name, $this->features)) {
    return;
  }

  // The landmark and source are taken from the aligned feature and
  // everything else from its target.
  $this->cacheFeature([
    'is_target' => TRUE,
    'line' => $this->current_line,
    'landmark' => $gff_feature['landmark'],
    'source' => $gff_feature['source'],
    'type' => $target['type'],
    'start' => $target['start'],
    'stop' => $target['stop'],
    'strand' => $target['strand'],
    'phase' => $target['phase'],
    'attr' => [],
    'skipped' => FALSE,
    'name' => $target_name,
    'uniquename' => $target_name,
    'synonyms' => [],
    'dbxrefs' => [],
    'terms' => [],
    'derives_from' => NULL,
    'organism' => $target['organism_id'],
    'target' => [],
    'properties' => [],
    'parent' => '',
  ]);
}
/**
 * Opens the cache file for read/write access.
 *
 * A temporary file is created and opened. Its real path is stored in
 * $this->gff_cache_file_name and its handle in $this->gff_cache_file.
 *
 * @throws Exception
 *   If the temporary file cannot be created or opened.
 */
private function openCacheFile() {
  $temp_file = drupal_tempnam('temporary://', "TripalGFF3Import_");
  if (!$temp_file) {
    throw new Exception(t('Cannot create the temporary cache file for the GFF3 importer.'));
  }
  $this->gff_cache_file_name = drupal_realpath($temp_file);
  $this->logMessage("Opening temporary cache file: !cfile",
    ['!cfile' => $this->gff_cache_file_name]);

  // Fail right away rather than on the first attempt to use the handle.
  $handle = fopen($this->gff_cache_file_name, "r+");
  if (!$handle) {
    throw new Exception(t('Cannot open the temporary cache file: !cfile',
      ['!cfile' => $this->gff_cache_file_name]));
  }
  $this->gff_cache_file = $handle;
}
/**
 * Closes and cleans up the cache file.
 *
 * The handle is only closed if it is open and the file is only removed if
 * it exists, so that calling this method while handling an exception cannot
 * raise another error.
 */
private function closeCacheFile() {
  if (is_resource($this->gff_cache_file)) {
    fclose($this->gff_cache_file);
  }
  $this->logMessage("Removing temporary cache file: !cfile",
    ['!cfile' => $this->gff_cache_file_name]);
  if ($this->gff_cache_file_name and file_exists($this->gff_cache_file_name)) {
    unlink($this->gff_cache_file_name);
  }
}
/**
 * Caches the processed feature from a GFF3 file.
 *
 * The feature is serialized and appended as one line to the cache file. Its
 * position in the file is stored in the feature list, keyed by uniquename,
 * and its feature_id is reset to NULL.
 *
 * @param $gff_feature
 *   The feature array. It must have a 'uniquename' key.
 */
private function cacheFeature($gff_feature) {
  // Always append. Reading a cached feature moves the file pointer, and
  // writing at that position would overwrite the features stored there.
  fseek($this->gff_cache_file, 0, SEEK_END);
  $findex = ftell($this->gff_cache_file);
  fwrite($this->gff_cache_file, serialize($gff_feature) . "\n");
  $this->features[$gff_feature['uniquename']]['findex'] = $findex;
  $this->features[$gff_feature['uniquename']]['feature_id'] = NULL;
}
/**
 * Retrieves a feature using its index from the cache file.
 *
 * @param $findex
 *   The position in the cache file at which the feature was written.
 *
 * @return
 *   The unserialized content of the line found at that position.
 */
private function getCachedFeature($findex) {
  // Each feature takes up one line of the cache file.
  fseek($this->gff_cache_file, $findex);
  $serialized = fgets($this->gff_cache_file);
  return unserialize($serialized);
}
/**
 * Imports the landmark features into Chado.
 *
 * Only the landmarks flagged with FALSE in the landmark list are handled.
 * Those are added from their ##sequence-region header if there is one, or
 * by name if a landmark type was provided.
 *
 * @throws Exception
 *   If a landmark has no header and no landmark type was provided.
 */
private function insertLandmarks() {
  foreach ($this->landmarks as $uniquename => $feature_id) {
    // Anything other than FALSE needs no work.
    if ($feature_id !== FALSE) {
      continue;
    }

    // First see if there is a definition in the headers region.
    if (array_key_exists($uniquename, $this->seq_region_headers)) {
      $this->insertHeaderLandmark($this->seq_region_headers[$uniquename]);
      continue;
    }

    // Second, if a landmark_type is provided then just add the landmark.
    if ($this->landmark_type) {
      $this->insertLandmark($uniquename);
      continue;
    }

    throw new Exception(t('The landmark (reference) sequence, !landmark, is not in the database and not specified in the GFF3 file. Please either pre-load the landmark sequences or set a "Landmark Type" in the GFF importer.',
      ['!landmark' => $uniquename]));
  }
}
/**
 * Imports the feature records into Chado.
 *
 * The features are handled in batches of 1000 and each batch is added with
 * a single multi-row INSERT. Features that already have a feature_id or are
 * flagged as skipped are not inserted.
 */
private function insertFeatures() {
  $batch_size = 1000;
  $num_features = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($num_features / $batch_size) + 1);

  $insert_head = "
    INSERT INTO {feature}
      (uniquename, name, type_id, organism_id, residues, md5checksum,
       seqlen, is_analysis, is_obsolete)
    VALUES\n";

  // The rows and arguments of the current batch.
  $rows = [];
  $params = [];
  // The position within the current batch and within the whole list.
  $slot = 0;
  $seen = 0;
  $batch_num = 1;

  foreach ($this->features as $uniquename => $info) {
    $feature = $this->getCachedFeature($info['findex']);
    $seen++;
    $slot++;

    // Only do an insert if this feature doesn't already exist in the
    // database.
    if (!$info['feature_id'] and !$feature['skipped']) {
      $residues = $this->getResidues($feature, FALSE);
      $type_id = $this->feature_cvterm_lookup[$feature['type']];
      $rows[] = "(:uniquename_$slot, :name_$slot, :type_id_$slot, :organism_id_$slot, :residues_$slot, " .
        " :md5checksum_$slot, :seqlen_$slot, FALSE, FALSE)";
      $params[":uniquename_$slot"] = $uniquename;
      $params[":name_$slot"] = $feature['name'];
      $params[":type_id_$slot"] = $type_id;
      $params[":organism_id_$slot"] = $feature['organism'] ? $feature['organism'] : $this->organism->getID();
      $params[":residues_$slot"] = $residues;
      $params[":md5checksum_$slot"] = $residues ? md5($residues) : '';
      $params[":seqlen_$slot"] = strlen($residues);
    }

    // Keep collecting until the batch is full or the list is done.
    if ($slot != $batch_size and $seen != $num_features) {
      continue;
    }

    if (count($params) > 0) {
      chado_query($insert_head . implode(",\n", $rows), $params);
    }
    $this->setItemsHandled($batch_num);
    $batch_num++;

    // Start the next batch.
    $rows = [];
    $params = [];
    $slot = 0;
  }
}
/**
 * Check if the features exist in the database.
 *
 * The features without a feature_id are looked up by uniquename in batches
 * of 1000. The feature_id of a match is stored in the feature list if its
 * type and organism are the ones expected for the feature.
 */
private function findFeatures() {
  $batch_size = 1000;
  $num_features = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($num_features / $batch_size) + 1);

  $lookup_sql = "SELECT uniquename, type_id, organism_id, feature_id FROM {feature} WHERE uniquename in (:uniquenames)";

  // The uniquenames of the current batch.
  $pending = [];
  $seen = 0;
  $batch_num = 1;

  foreach ($this->features as $uniquename => $info) {
    $seen++;
    if (!$info['feature_id']) {
      $pending[] = $uniquename;
    }

    // Keep collecting until the batch is full or the list is done.
    if (count($pending) != $batch_size and $seen != $num_features) {
      continue;
    }

    if (count($pending) > 0) {
      $results = chado_query($lookup_sql, [':uniquenames' => $pending]);
      while ($row = $results->fetchObject()) {
        // A uniquename alone does not identify a feature. The type and
        // organism must match those of the cached feature as well.
        $cached = $this->getCachedFeature($this->features[$row->uniquename]['findex']);
        $expected_type_id = $this->feature_cvterm_lookup[$cached['type']];
        $expected_organism_id = $cached['organism'] ? $cached['organism'] : $this->organism->getID();
        if ($expected_type_id == $row->type_id and $expected_organism_id == $row->organism_id) {
          $this->features[$row->uniquename]['feature_id'] = $row->feature_id;
        }
      }
    }
    $this->setItemsHandled($batch_num);
    $batch_num++;

    // Start the next batch.
    $pending = [];
  }
}
/**
 * Deletes all ancillary data about a feature so we can re-insert it.
 *
 * Applies to features that have a feature_id and are not marked as skipped.
 * For each chunk of feature IDs, rows are removed from featureprop,
 * featureloc, feature_cvterm, feature_dbxref, feature_synonym,
 * feature_relationship (rows where the feature is the subject) and
 * analysisfeature.
 */
private function deleteFeatureData() {
  $chunk_size = 1000;
  $feature_count = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($feature_count / $chunk_size) + 1);

  // Run in this order for every chunk.
  $delete_statements = [
    "DELETE from {featureprop} WHERE feature_id IN (:feature_ids)",
    "DELETE from {featureloc} WHERE feature_id IN (:feature_ids)",
    "DELETE from {feature_cvterm} WHERE feature_id IN (:feature_ids)",
    "DELETE from {feature_dbxref} WHERE feature_id IN (:feature_ids)",
    "DELETE from {feature_synonym} WHERE feature_id IN (:feature_ids)",
    "DELETE from {feature_relationship} WHERE subject_id IN (:feature_ids)",
    "DELETE from {analysisfeature} WHERE feature_id IN (:feature_ids)",
  ];

  $slot = 0;
  $seen = 0;
  $batch = 1;
  $doomed_ids = [];
  foreach ($this->features as $info) {
    $feature = $this->getCachedFeature($info['findex']);
    $seen++;
    $slot++;
    if ($info['feature_id'] && !$feature['skipped']) {
      $doomed_ids[] = $info['feature_id'];
    }

    // Flush when the chunk is full or the last feature has been visited.
    if ($slot == $chunk_size || $seen == $feature_count) {
      if (count($doomed_ids) > 0) {
        $params = [':feature_ids' => $doomed_ids];
        foreach ($delete_statements as $statement) {
          chado_query($statement, $params);
        }
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $slot = 0;
      $doomed_ids = [];
    }
  }
}
/**
 * Inserts the properties of every non-skipped feature into featureprop.
 *
 * Each value of each property becomes one row; the value's array key is
 * used as the rank and the type is resolved through
 * $this->featureprop_cvterm_lookup using the lower-cased property name.
 * Rows are flushed once 100 non-skipped features have been collected, or at
 * the last feature.
 */
private function insertFeatureProps(){
  $chunk_size = 100;
  $feature_count = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($feature_count / $chunk_size) + 1);

  $insert_prefix = "INSERT INTO {featureprop} (feature_id, type_id, value, rank) VALUES\n";
  // $counted tracks non-skipped features in the chunk, $slot numbers the
  // placeholders, $seen counts every feature visited.
  $counted = 0;
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->features as $info) {
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($info['findex']);
    $seen++;

    if (!$feature['skipped']) {
      $counted++;
      foreach ($feature['properties'] as $prop_name => $values) {
        foreach ($values as $rank => $value) {
          $slot++;
          $rows[] = "(:feature_id_$slot, :type_id_$slot, :value_$slot, :rank_$slot)";
          $params[":feature_id_$slot"] = $feature_id;
          $params[":type_id_$slot"] = $this->featureprop_cvterm_lookup[strtolower($prop_name)];
          $params[":value_$slot"] = $value;
          $params[":rank_$slot"] = $rank;
        }
      }
    }

    // Flush when the chunk is full or the last feature has been visited.
    if ($counted == $chunk_size || $seen == $feature_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $counted = 0;
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }
}
/**
 * Inserts parent/child rows into feature_relationship.
 *
 * Walks $this->parent_lookup (parent => list of child findex values). For
 * each non-skipped parent, every child becomes the subject and the parent
 * the object of a relationship. The type is 'derives_from' for children of
 * type 'polypeptide' or 'protein' and 'part_of' otherwise; the rank is the
 * child's position in the parent's list.
 */
private function insertFeatureParents(){
  $chunk_size = 100;
  $parent_count = count($this->parent_lookup);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($parent_count / $chunk_size) + 1);

  // Relationship types used below.
  $part_of = $this->getTypeID('part_of', FALSE);
  $derives_from = $this->getTypeID('derives_from', FALSE);

  $insert_prefix = "INSERT INTO {feature_relationship} (subject_id, object_id, type_id, rank) VALUES\n";
  $counted = 0;
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->parent_lookup as $parent => $children) {
    $seen++;
    $counted++;
    $parent_feature = $this->getCachedFeature($this->features[$parent]['findex']);
    $parent_feature_id = $this->features[$parent_feature['uniquename']]['feature_id'];

    if (!$parent_feature['skipped']) {
      $rank = 0;
      foreach ($children as $child_findex) {
        $slot++;
        $child_feature = $this->getCachedFeature($child_findex);
        $child_feature_id = $this->features[$child_feature['uniquename']]['feature_id'];
        $is_protein = ($child_feature['type'] == 'polypeptide' || $child_feature['type'] == 'protein');

        $rows[] = "(:subject_id_$slot, :object_id_$slot, :type_id_$slot, :rank_$slot)";
        $params[":subject_id_$slot"] = $child_feature_id;
        $params[":object_id_$slot"] = $parent_feature_id;
        $params[":type_id_$slot"] = $is_protein ? $derives_from : $part_of;
        $params[":rank_$slot"] = $rank;
        $rank++;
      }
    }

    // Flush when the chunk is full or the last parent has been visited.
    if ($counted == $chunk_size || $seen == $parent_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $counted = 0;
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }
}
/**
 * Looks up existing dbxref records for every entry in $this->dbxref_lookup.
 *
 * Entries are matched on (accession, db_id) in chunks. For every row found
 * the dbxref_id is recorded under the "<db name>:<accession>" key of
 * $this->dbxref_lookup; when a cvterm is attached to the dbxref, its
 * cvterm_id is recorded there as well and in $this->cvterm_lookup.
 */
private function findDbxrefs() {
  $chunk_size = 1000;
  $dbxref_count = count($this->dbxref_lookup);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($dbxref_count / $chunk_size) + 1);

  // Dbxrefs may already be present, so look for them first; the ones that
  // are missing get inserted in a later step.
  $select_prefix = "
    SELECT DB.name, DBX.db_id, DBX.accession, DBX.dbxref_id, CVT.cvterm_id
    FROM {dbxref} DBX
      INNER JOIN {db} DB on DB.db_id = DBX.db_id
      LEFT JOIN {cvterm} CVT on DBX.dbxref_id = CVT.dbxref_id
    WHERE
  ";
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $clauses = [];
  $params = [];
  foreach ($this->dbxref_lookup as $info) {
    $slot++;
    $seen++;
    $clauses[] = "(DBX.accession = :accession_$slot and DBX.db_id = :db_id_$slot)";
    $params[":accession_$slot"] = $info['accession'];
    $params[":db_id_$slot"] = $this->db_lookup[$info['db']];

    // Keep collecting until the chunk is full or the last entry is reached.
    if ($slot != $chunk_size && $seen != $dbxref_count) {
      continue;
    }

    $results = chado_query($select_prefix . implode(" OR\n", $clauses), $params);
    while ($row = $results->fetchObject()) {
      $key = $row->name . ':' . $row->accession;
      $this->dbxref_lookup[$key]['dbxref_id'] = $row->dbxref_id;
      if ($row->cvterm_id) {
        $this->cvterm_lookup[$key] = $row->cvterm_id;
        $this->dbxref_lookup[$key]['cvterm_id'] = $row->cvterm_id;
      }
    }
    $this->setItemsHandled($batch);
    $batch++;

    // Start the next chunk from a clean slate.
    $slot = 0;
    $clauses = [];
    $params = [];
  }
}
/**
 * Groups child features under their parent, ordered by start coordinate.
 *
 * For every feature that has a parent, the child's findex is stored in
 * $this->parent_lookup[<parent>][<start>]. The children of each parent
 * touched here are then sorted by start coordinate, so that a child's
 * position in the list can be used as its rank.
 *
 * This function should not be executed until after features are loaded
 * into the database and we have feature_ids for all of them.
 */
private function findChildRanks() {
  $touched_parents = [];
  foreach ($this->features as $info) {
    $feature = $this->getCachedFeature($info['findex']);
    // empty() also covers features that carry no 'parent' key at all.
    if (empty($feature['parent'])) {
      continue;
    }
    $parent = $feature['parent'];
    // Keyed by start coordinate: a later child with the same start replaces
    // the earlier one.
    $this->parent_lookup[$parent][$feature['start']] = $info['findex'];
    $touched_parents[$parent] = TRUE;
  }

  // PHP arrays keep insertion order, so the children have to be sorted
  // explicitly to be in start-coordinate order rather than file order.
  foreach (array_keys($touched_parents) as $parent) {
    ksort($this->parent_lookup[$parent], SORT_NUMERIC);
  }
}
/**
 * Resolves the feature_id of every landmark that does not have one yet.
 *
 * $this->landmarks maps a landmark name to its feature_id. Names without an
 * ID are looked up in the feature table by uniquename, in chunks, and the
 * IDs found are written back into $this->landmarks.
 */
private function findLandmarks() {
  $chunk_size = 1000;
  $landmark_count = count($this->landmarks);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($landmark_count / $chunk_size) + 1);

  $lookup_sql = "SELECT name, uniquename, feature_id FROM {feature} WHERE uniquename in (:landmarks)";
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $unresolved = [];
  foreach ($this->landmarks as $landmark_name => $feature_id) {
    $slot++;
    $seen++;
    // Only landmarks whose feature_id is still unknown need a lookup.
    if (!$feature_id) {
      $unresolved[] = $landmark_name;
    }

    // Keep collecting until the chunk is full or the last landmark is
    // reached.
    if ($slot != $chunk_size && $seen != $landmark_count) {
      continue;
    }

    if (count($unresolved) > 0) {
      $results = chado_query($lookup_sql, [':landmarks' => $unresolved]);
      while ($row = $results->fetchObject()) {
        $this->landmarks[$row->uniquename] = $row->feature_id;
      }
    }
    $this->setItemsHandled($batch);
    $batch++;

    // Start the next chunk from a clean slate.
    $slot = 0;
    $unresolved = [];
  }
}
/**
 * Inserts the dbxref records that were not found in the database.
 *
 * An entry of $this->dbxref_lookup is inserted only when it has neither a
 * 'dbxref_id' nor a 'cvterm_id' key. Rows are sent in multi-row INSERT
 * statements, one per chunk of up to 1000 lookup entries.
 */
private function insertDbxrefs() {
  $chunk_size = 1000;
  $dbxref_count = count($this->dbxref_lookup);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($dbxref_count / $chunk_size) + 1);

  $insert_prefix = "INSERT INTO {dbxref} (db_id, accession) VALUES\n";
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->dbxref_lookup as $info) {
    $slot++;
    $seen++;
    // Skip entries that already carry a dbxref_id or a cvterm_id.
    $already_known = array_key_exists('dbxref_id', $info) || array_key_exists('cvterm_id', $info);
    if (!$already_known) {
      $rows[] = "(:db_id_$slot, :accession_$slot)";
      $params[":db_id_$slot"] = $this->db_lookup[$info['db']];
      $params[":accession_$slot"] = $info['accession'];
    }

    // Flush when the chunk is full or the last entry has been visited.
    if ($slot == $chunk_size || $seen == $dbxref_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }
}
/**
 * Links every non-skipped feature to its dbxrefs in feature_dbxref.
 *
 * The keys of a feature's 'dbxrefs' array are resolved to dbxref IDs
 * through $this->dbxref_lookup. Rows are flushed once 100 non-skipped
 * features have been collected, or at the last feature.
 */
private function insertFeatureDbxrefs() {
  $chunk_size = 100;
  $feature_count = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($feature_count / $chunk_size) + 1);

  $insert_prefix = "INSERT INTO {feature_dbxref} (feature_id, dbxref_id) VALUES \n";
  $counted = 0;
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->features as $info) {
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($info['findex']);
    $seen++;

    if (!$feature['skipped']) {
      $counted++;
      foreach ($feature['dbxrefs'] as $dbxref_key => $details) {
        $slot++;
        $rows[] = "(:feature_id_$slot, :dbxref_id_$slot)";
        $params[":feature_id_$slot"] = $feature_id;
        $params[":dbxref_id_$slot"] = $this->dbxref_lookup[$dbxref_key]['dbxref_id'];
      }
    }

    // Flush when the chunk is full or the last feature has been visited.
    if ($counted == $chunk_size || $seen == $feature_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $counted = 0;
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }
}
/**
 * Links every non-skipped feature to its terms in feature_cvterm.
 *
 * The keys of a feature's 'terms' array are resolved to cvterm IDs through
 * $this->cvterm_lookup, and every row uses the pub_id of $this->null_pub.
 * Rows are flushed once 100 non-skipped features have been collected, or at
 * the last feature.
 */
private function insertFeatureCVterms() {
  $chunk_size = 100;
  $feature_count = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($feature_count / $chunk_size) + 1);

  $insert_prefix = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id) VALUES \n";
  $counted = 0;
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->features as $info) {
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($info['findex']);
    $seen++;

    if (!$feature['skipped']) {
      $counted++;
      // Only the term key is needed; the per-term details are not used.
      foreach ($feature['terms'] as $term_key => $term_details) {
        $slot++;
        $rows[] = "(:feature_id_$slot, :cvterm_id_$slot, :pub_id_$slot)";
        $params[":feature_id_$slot"] = $feature_id;
        $params[":cvterm_id_$slot"] = $this->cvterm_lookup[$term_key];
        $params[":pub_id_$slot"] = $this->null_pub->pub_id;
      }
    }

    // Flush when the chunk is full or the last feature has been visited.
    if ($counted == $chunk_size || $seen == $feature_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $counted = 0;
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }
}
/**
 * Inserts the target alignment locations of features into featureloc.
 *
 * For every non-skipped feature whose 'target' entry names a target
 * feature, one featureloc row is written with the target feature as the
 * source feature, the target's start/stop/strand/phase as the location and
 * a rank of 1.
 */
private function insertFeatureTargets() {
  $batch_size = 1000;
  $num_features = count($this->features);
  $num_batches = (int) ($num_features / $batch_size) + 1;
  $this->setItemsHandled(0);
  $this->setTotalItems($num_batches);

  $init_sql = "
    INSERT INTO {featureloc}
      (srcfeature_id, feature_id, fmin, fmax, strand, phase, rank)
    VALUES\n";
  $i = 0;
  $total = 0;
  $batch_num = 1;
  $rows = [];
  $args = [];
  foreach ($this->features as $info) {
    $findex = $info['findex'];
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($findex);
    $total++;
    $i++;

    // Only non-skipped features that have a named target are handled. The
    // is_array() guard protects against a missing or non-array 'target'.
    $target_info = $feature['target'] ?? NULL;
    if (!$feature['skipped'] and is_array($target_info) and array_key_exists('name', $target_info)) {
      $tname = $target_info['name'];
      $tfindex = $this->features[$tname]['findex'];
      $tfeature_id = $this->features[$tname]['feature_id'];
      $target = $this->getCachedFeature($tfindex);

      // According to the Chado instructions for rank, the feature aligned
      // to the landmark will have a rank of 0. The feature aligned to the
      // target match will have a rank of 1.
      $rank = 1;

      // A phase of 0 is a valid value and must not be turned into NULL, so
      // test for a numeric value rather than for truthiness.
      $phase = $target['phase'] ?? NULL;

      $rows[] = "(:srcfeature_id_$i, :feature_id_$i, :fmin_$i, :fmax_$i," .
        " :strand_$i, :phase_$i, :rank_$i)";
      $args[":srcfeature_id_$i"] = $tfeature_id;
      $args[":feature_id_$i"] = $feature_id;
      $args[":fmin_$i"] = $target['start'];
      $args[":fmax_$i"] = $target['stop'];
      $args[":strand_$i"] = $target['strand'];
      $args[":phase_$i"] = is_numeric($phase) ? $phase : NULL;
      $args[":rank_$i"] = $rank;
    }

    // If we've reached the size of the batch then let's do the insert.
    if ($i == $batch_size or $total == $num_features) {
      if (count($args) > 0) {
        chado_query($init_sql . implode(",\n", $rows), $args);
      }
      $this->setItemsHandled($batch_num);
      $batch_num++;

      // Now reset all of the variables for the next batch.
      $rows = [];
      $i = 0;
      $args = [];
    }
  }
}
/**
 * Inserts 'derives_from' relationships into feature_relationship.
 *
 * Every non-skipped feature with a 'derives_from' value becomes the subject
 * of a relationship (rank 0) whose object is the feature registered under
 * that value in $this->features.
 */
private function insertFeatureDerivesFrom() {
  $chunk_size = 100;
  $feature_count = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($feature_count / $chunk_size) + 1);

  // The relationship type shared by all rows.
  $derives_from_id = $this->getTypeID('derives_from', FALSE);

  $insert_prefix = "INSERT INTO {feature_relationship} (subject_id, object_id, type_id, rank) VALUES\n";
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->features as $info) {
    $feature = $this->getCachedFeature($info['findex']);
    $seen++;
    $slot++;

    if (!$feature['skipped'] && $feature['derives_from']) {
      $rows[] = "(:subject_id_$slot, :object_id_$slot, :type_id_$slot, 0)";
      $params[":subject_id_$slot"] = $info['feature_id'];
      $params[":object_id_$slot"] = $this->features[$feature['derives_from']]['feature_id'];
      $params[":type_id_$slot"] = $derives_from_id;
    }

    // Flush when the chunk is full or the last feature has been visited.
    if ($slot == $chunk_size || $seen == $feature_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }
}
/**
 * Inserts the landmark locations of features into featureloc.
 *
 * Every feature that is not skipped and is not a match "target" gets one
 * row locating it on its landmark. The rank is the position of the
 * feature's start coordinate among the children recorded for its parent in
 * $this->parent_lookup, or 0 when the feature has no parent entry.
 */
private function insertFeatureLocs() {
  $batch_size = 1000;
  $num_features = count($this->features);
  $num_batches = (int) ($num_features / $batch_size) + 1;
  $this->setItemsHandled(0);
  $this->setTotalItems($num_batches);

  $init_sql = "
    INSERT INTO {featureloc}
      (srcfeature_id, feature_id, fmin, fmax, strand, phase, rank)
    VALUES\n";
  $i = 0;
  $total = 0;
  $batch_num = 1;
  $rows = [];
  $args = [];
  foreach ($this->features as $info) {
    $findex = $info['findex'];
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($findex);
    $total++;
    $i++;

    // If the feature is not skipped and is not a match "target".
    if (!$feature['skipped'] and $feature['is_target'] == FALSE) {
      // Get the rank of this feature by iterating through all siblings of
      // the parent and finding where this feature is in terms of start
      // position.
      $rank = 0;
      if (array_key_exists('parent', $feature)) {
        // The parent may have no entry in the lookup; ?? avoids a notice.
        $children_start = $this->parent_lookup[$feature['parent']] ?? NULL;
        if (is_array($children_start)) {
          foreach (array_keys($children_start) as $sib_start) {
            if ($sib_start == $feature['start']) {
              break;
            }
            $rank++;
          }
        }
      }

      // A phase of 0 is a valid value and must not be turned into NULL, so
      // test for a numeric value rather than for truthiness.
      $phase = $feature['phase'] ?? NULL;

      $rows[] = "(:srcfeature_id_$i, :feature_id_$i, :fmin_$i, :fmax_$i," .
        " :strand_$i, :phase_$i, :rank_$i)";
      $args[":srcfeature_id_$i"] = $this->landmarks[$feature['landmark']];
      $args[":feature_id_$i"] = $feature_id;
      $args[":fmin_$i"] = $feature['start'];
      $args[":fmax_$i"] = $feature['stop'];
      $args[":strand_$i"] = $feature['strand'];
      $args[":phase_$i"] = is_numeric($phase) ? $phase : NULL;
      $args[":rank_$i"] = $rank;
    }

    // If we've reached the size of the batch then let's do the insert.
    if ($i == $batch_size or $total == $num_features) {
      if (count($args) > 0) {
        chado_query($init_sql . implode(",\n", $rows), $args);
      }
      $this->setItemsHandled($batch_num);
      $batch_num++;

      // Now reset all of the variables for the next batch.
      $rows = [];
      $i = 0;
      $args = [];
    }
  }
}
/**
 * Finds an organism from an organism attribute value.
 *
 * Results are cached in $this->organism_lookup. When no organism matches
 * and $this->create_organism is set, a new organism record is inserted.
 *
 * @param $organism_attr
 *   The attribute value in "genus:species" form. It is split on the first
 *   colon only.
 * @param $line_num
 *   The line number the attribute was read from; used in error messages.
 *
 * @return
 *   The organism ID, or NULL if no organism was found and none was created.
 *
 * @throws Exception
 *   If the attribute value has no colon or if it matches more than one
 *   organism.
 */
private function findOrganism($organism_attr, $line_num) {
  if (array_key_exists($organism_attr, $this->organism_lookup)) {
    return $this->organism_lookup[$organism_attr];
  }

  // Without a colon there is no species part, and destructuring the result
  // would leave $species undefined. Fail with a message that points at the
  // offending line instead.
  $parts = explode(':', $organism_attr, 2);
  if (count($parts) != 2) {
    throw new Exception(t('The "organism" attribute, %organism, on line %line_num is not in the "genus:species" format',
      ['%organism' => $organism_attr, '%line_num' => $line_num]));
  }
  [$genus, $species] = $parts;

  // Get the organism object.
  $organism = new ChadoRecord('organism');
  $organism->setValues([
    'genus' => $genus,
    'species' => $species
  ]);
  $num_found = $organism->find();
  if ($num_found == 1){
    $this->organism_lookup[$organism_attr] = $organism->getID();
    return $organism->getID();
  }
  if ($num_found > 1) {
    throw new Exception(t('Multiple organisms were found for the "organism" attribute, %organism, on line %line_num',
      ['%organism' => $organism_attr, '%line_num' => $line_num]));
  }

  // No match: create the organism if the importer was asked to do so.
  if ($this->create_organism) {
    $organism->insert();
    $this->organism_lookup[$organism_attr] = $organism->getID();
    return $organism->getID();
  }
  return NULL;
}
/**
 * Looks up the synonym_id of every synonym used by the parsed features.
 *
 * Synonyms are gathered per chunk of 1000 features and de-duplicated. Each
 * one is registered in $this->synonym_lookup (with NULL if unknown), and
 * those still without an ID are queried by name and by the type of
 * $this->exact_syn. IDs found are written back into the lookup.
 */
private function findSynonyms() {
  $chunk_size = 1000;
  $feature_count = count($this->features);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($feature_count / $chunk_size) + 1);

  $select_prefix = "SELECT synonym_id, name FROM {synonym} WHERE \n";
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $collected = [];
  foreach ($this->features as $info) {
    $feature = $this->getCachedFeature($info['findex']);
    $slot++;
    $seen++;

    // Gather this feature's synonyms, if it has any.
    if (array_key_exists('synonyms', $feature)) {
      foreach ($feature['synonyms'] as $synonym) {
        $collected[] = $synonym;
      }
    }

    // Keep collecting until the chunk is full or the last feature is reached.
    if ($slot != $chunk_size && $seen != $feature_count) {
      continue;
    }

    // Build one clause per synonym whose ID is still unknown. The
    // placeholder number advances for every synonym, known or not.
    $clauses = [];
    $params = [];
    $placeholder = 0;
    foreach (array_unique($collected) as $synonym) {
      $placeholder++;
      if (!array_key_exists($synonym, $this->synonym_lookup)) {
        $this->synonym_lookup[$synonym] = NULL;
      }
      if (!$this->synonym_lookup[$synonym]) {
        $clauses[] = "(type_id = :type_id_$placeholder AND name = :name_$placeholder)";
        $params[":type_id_$placeholder"] = $this->exact_syn->cvterm_id;
        $params[":name_$placeholder"] = $synonym;
      }
    }
    if (count($params) > 0) {
      $results = chado_query($select_prefix . implode(" OR\n", $clauses), $params);
      while ($row = $results->fetchObject()) {
        $this->synonym_lookup[$row->name] = $row->synonym_id;
      }
    }
    $this->setItemsHandled($batch);
    $batch++;

    // Start the next chunk from a clean slate.
    $slot = 0;
    $collected = [];
  }
}
/**
 * Inserts the synonyms that have no synonym_id yet.
 *
 * Every entry of $this->synonym_lookup without an ID is inserted with the
 * type of $this->exact_syn and an empty synonym_sgml. Afterwards
 * findSynonyms() is run again to pick up the IDs of the new records.
 */
private function insertSynonyms() {
  $chunk_size = 1000;
  $synonym_count = count($this->synonym_lookup);
  $this->setItemsHandled(0);
  $this->setTotalItems((int) ($synonym_count / $chunk_size) + 1);

  $insert_prefix = "INSERT INTO {synonym} (type_id, name, synonym_sgml) VALUES\n";
  $slot = 0;
  $seen = 0;
  $batch = 1;
  $rows = [];
  $params = [];
  foreach ($this->synonym_lookup as $synonym => $synonym_id) {
    $slot++;
    $seen++;
    // Only synonyms that were not found in the database are inserted.
    if (!$synonym_id) {
      $rows[] = "(:type_id_$slot,:name_$slot, '')";
      $params[":type_id_$slot"] = $this->exact_syn->cvterm_id;
      $params[":name_$slot"] = $synonym;
    }

    // Flush when the chunk is full or the last synonym has been visited.
    if ($slot == $chunk_size || $seen == $synonym_count) {
      if (count($params) > 0) {
        chado_query($insert_prefix . implode(",\n", $rows), $params);
      }
      $this->setItemsHandled($batch);
      $batch++;

      // Start the next chunk from a clean slate.
      $slot = 0;
      $rows = [];
      $params = [];
    }
  }

  // Now we need to retrieve the synonym IDs.
  $this->findSynonyms();
}
/**
 * Links every non-skipped feature to its synonyms in feature_synonym.
 *
 * Synonym names are de-duplicated per feature and resolved to synonym IDs
 * through $this->synonym_lookup; every row uses the pub_id of
 * $this->null_pub. Rows are flushed once 1000 non-skipped features have
 * been collected, or at the last feature.
 */
private function insertFeatureSynonyms(){
  $batch_size = 1000;
  $num_features = count($this->features);
  $num_batches = (int) ($num_features / $batch_size) + 1;
  $this->setItemsHandled(0);
  $this->setTotalItems($num_batches);

  $init_sql = "INSERT INTO {feature_synonym} (synonym_id, feature_id, pub_id) VALUES \n";
  $i = 0;
  $j = 0;
  $total = 0;
  $batch_num = 1;
  // The value rows start out empty, so the first append never touches an
  // undefined variable.
  $rows = [];
  $args = [];
  foreach ($this->features as $info) {
    $findex = $info['findex'];
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($findex);
    $total++;

    // If the feature is not skipped
    if (!$feature['skipped']) {
      $i++;
      // A feature may have no 'synonyms' entry; treat that as an empty list.
      $synonyms = [];
      if (array_key_exists('synonyms', $feature) and is_array($feature['synonyms'])) {
        $synonyms = array_unique($feature['synonyms']);
      }
      // Handle all of the synonyms for this feature.
      foreach ($synonyms as $synonym) {
        $j++;
        $rows[] = "(:synonym_id_$j, :feature_id_$j, :pub_id_$j)";
        $args[":synonym_id_$j"] = $this->synonym_lookup[$synonym];
        $args[":feature_id_$j"] = $feature_id;
        $args[":pub_id_$j"] = $this->null_pub->pub_id;
      }
    }

    // If we've reached the size of the batch then let's do the insert.
    if ($i == $batch_size or $total == $num_features) {
      if (count($args) > 0) {
        chado_query($init_sql . implode(",\n", $rows), $args);
      }
      $this->setItemsHandled($batch_num);
      $batch_num++;

      // Now reset all of the variables for the next batch.
      $rows = [];
      $i = 0;
      $j = 0;
      $args = [];
    }
  }
}
/**
 * Retrieves the residues for a given feature.
 *
 * The current implementation ignores both arguments and always yields an
 * empty string.
 */
private function getResidues($feature, $is_landmark = FALSE) {
  $residues = '';
  return $residues;
}
/**
 * Determines the name for a feature using the ID and Name attributes.
 *
 * With both attributes present, ID becomes the uniquename and Name the
 * name. With only one present, it is used for both. With neither, the
 * alternate ID attribute ($this->alt_id_attr) is used if the feature has
 * it; otherwise names are generated from the parent, type, landmark and
 * coordinates.
 *
 * If the uniquename is already registered in $this->features, a feature
 * with a Parent gets a numeric suffix ("_2", "_3", ...), and a feature
 * whose previous record is a match target keeps the uniquename.
 *
 * @param $attrs
 *   The associative array of attributes for the feature. Each attribute
 *   holds a list of values, of which the first is used.
 * @param $type
 *   The type of feature.
 * @param $landmark_name
 *   The name of the landmark the feature is located on.
 * @param $fmin
 *   The start coordinate; generated uniquenames show it as $fmin + 1.
 * @param $fmax
 *   The end coordinate.
 *
 * @return array
 *   An associative array with 'uniquename' and 'name' keys.
 *
 * @throws Exception
 *   If the uniquename is already used by a feature that is neither a
 *   sub-feature with a parent nor a match target.
 */
private function getFeatureName($attrs, $type, $landmark_name, $fmin, $fmax) {
  $uniquename = '';
  $name = '';

  // The attribute key is 'Name' (capitalised), as used in the branches
  // below. Testing for a lower-case 'name' here made features that have a
  // Name but no ID fall into the name-generation branch.
  $has_id = array_key_exists('ID', $attrs);
  $has_name = array_key_exists('Name', $attrs);

  // If there is no ID or name then try to create a name and ID.
  if (!$has_id and !$has_name) {
    // Check if an alternate ID field is suggested, if so, then use
    // that for the name.
    if (array_key_exists($this->alt_id_attr, $attrs)) {
      $uniquename = $attrs[$this->alt_id_attr][0];
      $name = $uniquename;
    }
    // If the row has a parent then generate a unique ID
    elseif (array_key_exists('Parent', $attrs)) {
      $uniquename = $attrs['Parent'][0] . "-" . $type . "-" .
        $landmark_name . ":" . ($fmin + 1) . ".." . $fmax;
      $name = $attrs['Parent'][0] . "-" . $type;
    }
    // Generate a unique name based on the type and location
    // and set the name to simply be the type.
    else {
      $uniquename = $type . "-" . $landmark_name . ":" . ($fmin + 1) . ".." . $fmax;
      $name = $type . "-" . $landmark_name;
    }
  }
  // Only an ID: use it for both.
  elseif (!$has_name) {
    $uniquename = $attrs['ID'][0];
    $name = $attrs['ID'][0];
  }
  // Only a Name: use it for both.
  elseif (!$has_id) {
    $uniquename = $attrs['Name'][0];
    $name = $attrs['Name'][0];
  }
  else {
    $uniquename = $attrs['ID'][0];
    $name = $attrs['Name'][0];
  }

  // Does this uniquename already exist?
  if (array_key_exists($uniquename, $this->features)) {
    $prev_feature = $this->getCachedFeature($this->features[$uniquename]['findex']);
    // A name can be duplicated for subfeatures (e.g. CDS features)
    // that have the same parent but are really all the same thing.
    if (array_key_exists('Parent', $attrs)) {
      // Iterate through the list of similar IDs and see how many we have
      // then add a numeric suffix.
      $i = 2;
      while (array_key_exists($uniquename . "_" . $i, $this->features)) {
        $i++;
      }
      $uniquename = $uniquename . "_" . $i;
    }
    // A name can be duplicated if there is a target match alignment and
    // the feature appears first in the GFF as a target before it appears
    // on its own independent line of the gff file.
    elseif ($prev_feature['is_target'] == TRUE) {
      // Do nothing, the previous feature is a target so we'll overwrite
      // it with this record.
    }
    else {
      throw new Exception(t("A feature with the same ID exists multiple times: !uname", ['!uname' => $uniquename]));
    }
  }
  return [
    'name' => $name,
    'uniquename' => $uniquename,
  ];
}
/**
 * Load the derives from attribute for a gff3 feature
 *
 * Resolves the object feature by uniquename and organism, first through
 * the tripal_gff_temp table and then through the feature table, and
 * inserts a 'derives_from' relationship (rank 0) from $feature to it unless
 * one already exists. For features of type 'protein' or 'polypeptide' a
 * row is also added to tripal_gffprotein_temp.
 *
 * @param $feature
 *   The subject feature record; its getID() and getValue('uniquename') are
 *   used.
 * @param $cvterm
 *   The type of the subject feature; only its name is read.
 * @param $object
 *   The uniquename of the feature that $feature derives from.
 * @param $organism
 *   The organism of the object feature; only its organism_id is read.
 * @param $fmin
 *   The start coordinate stored in tripal_gffprotein_temp.
 * @param $fmax
 *   The end coordinate stored in tripal_gffprotein_temp.
 *
 * @throws Exception
 *   If the record cannot be saved in the temporary protein table.
 *
 * @ingroup gff3_loader
 */
private function loadDerivesFromOld($feature, $cvterm, $object,
  $organism, $fmin, $fmax) {
  $type = $cvterm->name;
  $derivesfrom_term = $this->getCvterm('derives_from');

  // First look for the object feature in the temp table to get its type.
  $values = [
    'organism_id' => $organism->organism_id,
    'uniquename' => $object,
  ];
  $result = chado_select_record('tripal_gff_temp', ['type_name'], $values);
  $type_id = NULL;
  if (count($result) > 0) {
    $type_id = $this->getCvterm($result[0]->type_name)->cvterm_id ?? NULL;
  }

  // If the object wasn't in the temp table then look for it in the
  // feature table and get its type.
  if (!$type_id) {
    $result = chado_select_record('feature', ['type_id'], $values);
    if (count($result) > 1) {
      $this->logMessage("Cannot find feature type for, '!subject' , in 'derives_from' relationship. Multiple matching features exist with this uniquename.",
        ['!subject' => $object], TRIPAL_WARNING);
      return;
    }
    if (count($result) == 0) {
      $this->logMessage("Cannot find feature type for, '!subject' , in 'derives_from' relationship.",
        ['!subject' => $object], TRIPAL_WARNING);
      return '';
    }
    // The select returns a list of rows, so the type must be read from the
    // single matching row rather than from the list itself.
    $type_id = $result[0]->type_id;
  }

  // Get the object feature.
  $match = [
    'organism_id' => $organism->organism_id,
    'uniquename' => $object,
    'type_id' => $type_id,
  ];
  $ofeature = chado_select_record('feature', ['feature_id'], $match);
  if (count($ofeature) == 0) {
    // The name being reported is the object's uniquename; this method has
    // no $subject variable.
    $this->logMessage("Could not add 'Derives_from' relationship " .
      "for %uniquename and %subject. Subject feature, '%subject', " .
      "cannot be found.", [
        '%uniquename' => $feature->getValue('uniquename'),
        '%subject' => $object,
      ], TRIPAL_ERROR);
    return;
  }

  // If this feature is a protein then add it to the tripal_gffprotein_temp.
  if ($type == 'protein' or $type == 'polypeptide') {
    $values = [
      'feature_id' => $feature->getID(),
      'parent_id' => $ofeature[0]->feature_id,
      'fmin' => $fmin,
      'fmax' => $fmax,
    ];
    $result = chado_insert_record('tripal_gffprotein_temp', $values);
    if (!$result) {
      throw new Exception(t("Cound not save record in temporary protein table, Cannot continue.", []));
    }
  }

  // Now check to see if the relationship already exists. If it does
  // then just return.
  $values = [
    'object_id' => $ofeature[0]->feature_id,
    'subject_id' => $feature->getID(),
    'type_id' => $derivesfrom_term->cvterm_id,
    'rank' => 0,
  ];
  $rel = chado_select_record('feature_relationship', ['*'], $values);
  if (count($rel) > 0) {
    return;
  }

  // Finally insert the relationship if it doesn't exist.
  $ret = chado_insert_record('feature_relationship', $values, [
    'skip_validation' => TRUE,
  ]);
  if (!$ret) {
    $this->logMessage("Could not add 'Derives_from' relationship for :uniquename and :subject.",
      [
        ':uniquename' => $feature->getValue('uniquename'),
        ':subject' => $object,
      ], TRIPAL_WARNING);
  }
}
/**
 * Associates every non-skipped feature with the analysis being loaded.
 *
 * One analysisfeature row is written per feature, using the ID of
 * $this->analysis. The feature's score is stored as the significance; a
 * score of '.' is stored as NULL.
 */
private function insertFeatureAnalysis() {
  $batch_size = 1000;
  $num_features = count($this->features);
  $num_batches = (int) ($num_features / $batch_size) + 1;
  $this->setItemsHandled(0);
  $this->setTotalItems($num_batches);

  $init_sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id, significance) VALUES \n";
  $i = 0;
  $total = 0;
  $batch_num = 1;
  // The value rows start out empty, so the first append never touches an
  // undefined variable.
  $rows = [];
  $args = [];
  foreach ($this->features as $info) {
    $findex = $info['findex'];
    $feature_id = $info['feature_id'];
    $feature = $this->getCachedFeature($findex);
    $i++;
    $total++;

    // If the feature is not skipped then add it to the table
    if (!$feature['skipped']) {
      $rows[] = "(:feature_id_$i, :analysis_id_$i, :significance_$i)";
      $args[":feature_id_$i"] = $feature_id;
      $args[":analysis_id_$i"] = $this->analysis->getID();
      // A '.' score means no value was given.
      if (strcmp($feature['score'], '.') != 0) {
        $args[":significance_$i"] = $feature['score'];
      }
      else {
        $args[":significance_$i"] = NULL;
      }
    }

    // If we've reached the size of the batch then let's do the insert.
    if ($i == $batch_size or $total == $num_features) {
      if (count($args) > 0) {
        chado_query($init_sql . implode(",\n", $rows), $args);
      }
      $this->setItemsHandled($batch_num);
      $batch_num++;

      // Now reset all of the variables for the next batch.
      $rows = [];
      $i = 0;
      $args = [];
    }
  }
}
- }
|