From 07c7583b4b5191ecbf551bd3db3de96177fd535b Mon Sep 17 00:00:00 2001
From: "louise.deleger" <louise.deleger@inra.fr>
Date: Thu, 7 Apr 2022 14:58:17 +0200
Subject: [PATCH 1/4] ignore pipeline output files under corpora/*/batches/

---
 .gitignore | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/.gitignore b/.gitignore
index 7b829ec7..81c4959b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,29 @@ corpora/*/batch/*/uses.txt
 corpora/*/batch/*/words.txt
 corpora/*/batch/*/yatea-var/
 corpora/*/batch/*/yatea/
+corpora/*/batches/*/adb/
+corpora/*/batches/*/alvisnlp.log
+corpora/*/batches/*/anaphora.txt
+corpora/*/batches/*/bacteria.txt
+corpora/*/batches/*/dependencies.txt
+corpora/*/batches/*/doc-mesh.txt
+corpora/*/batches/*/geo.txt
+corpora/*/batches/*/habitats.txt
+corpora/*/batches/*/index-food/
+corpora/*/batches/*/index/
+corpora/*/batches/*/microorganisms-short.txt
+corpora/*/batches/*/microorganisms.txt
+corpora/*/batches/*/phenotype-relations.txt
+corpora/*/batches/*/phenotypes.txt
+corpora/*/batches/*/relations.txt
+corpora/*/batches/*/sentences.txt
+corpora/*/batches/*/success.txt
+corpora/*/batches/*/taxa.txt
+corpora/*/batches/*/uses-relations.txt
+corpora/*/batches/*/uses.txt
+corpora/*/batches/*/words.txt
+corpora/*/batches/*/yatea-var/
+corpora/*/batches/*/yatea/
 corpora/*/expander/
 corpora/*/batch/*/a2/
 corpora/BioNLP-OST-2019/batch/*/eval.json
-- 
GitLab


From e96287e66e63df9e5a6390d2b478887f454e5697 Mon Sep 17 00:00:00 2001
From: "louise.deleger" <louise.deleger@inra.fr>
Date: Thu, 7 Apr 2022 14:58:47 +0200
Subject: [PATCH 2/4] pass alvisnlp log file name via params instead of the log directive

---
 process_PubMed_corpus.snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile
index b86fec5e..8fd501c8 100644
--- a/process_PubMed_corpus.snakefile
+++ b/process_PubMed_corpus.snakefile
@@ -42,8 +42,8 @@ rule run_pubmed_entities:
 		phenotypes="corpora/pubmed/batches/{B}/phenotypes.txt",
 		uses="corpora/pubmed/batches/{B}/uses.txt",
 		index=directory("corpora/pubmed/batches/{B}/index")
-	log:"corpora/pubmed/batches/{B}/alvisnlp.log"
 	params:
+		logfile="alvisnlp.log",
 		batch="{B}",
 		corpus='pubmed',
         inhibitSyntax='inhibit-syntax',
@@ -62,7 +62,7 @@ rule run_pubmed_entities:
 	singularity:config["SINGULARITY_IMG"]
 	shell:"""
 		mkdir -p {params.dummy} && alvisnlp -J-XX:+UseSerialGC -J-Xmx20g -cleanTmp -verbose \
-		-log {log} \
+		-log {params.logfile} \
 		-alias format pubmed \
 		-alias input {input.file} \
 		-alias input-xslt {input.xslt} \
-- 
GitLab


From b49593a6c758db19245a453216b2050f66ed6a33 Mon Sep 17 00:00:00 2001
From: "louise.deleger" <louise.deleger@inra.fr>
Date: Thu, 7 Apr 2022 15:03:57 +0200
Subject: [PATCH 3/4] get Uses from same file as Habitats and Phenotypes

---
 preprocess-ontology.snakefile | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/preprocess-ontology.snakefile b/preprocess-ontology.snakefile
index d77135ab..e12ba9f9 100644
--- a/preprocess-ontology.snakefile
+++ b/preprocess-ontology.snakefile
@@ -9,9 +9,9 @@ HABITAT_ROOT='OBT:000001'
 PHENOTYPE_ROOT='OBT:000002'
 
 ## Use concept root
-USE_ROOT='EC:0000000'
+USE_ROOT='OBT:004185'
 
-ONTONAMES = 'BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype Use_V2'
+ONTONAMES = 'BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype BioNLP-OST+EnovFood-Use'
 
 
 '''
@@ -21,7 +21,7 @@ rule all:
 	input:
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.json', 
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.json',
-		'ancillaries/Use_V2.json',
+		'ancillaries/BioNLP-OST+EnovFood-Use.json',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		'ancillaries/food-process-lexicon.txt',
@@ -34,7 +34,7 @@ rule all:
 '''
 Remove obsolote concepts
 '''
-rule remove_obslete_concepts:
+rule remove_obsolete_concepts:
 	input:
 		onto='ancillaries/BioNLP-OST+EnovFood.obo'
 	output:
@@ -44,7 +44,7 @@ rule remove_obslete_concepts:
 
 
 '''
-Cut subtrees that are not used + separate between biotopes and phenotypes and molecular entities
+Cut Habitat subtree
 '''
 rule cut_subtrees_habitat:
 	input:
@@ -60,7 +60,7 @@ rule cut_subtrees_habitat:
 			"""
 
 '''
-Cut subtrees that are not used + separate between biotopes and phenotypes and molecular entities
+Cut Phenotype subtree
 '''
 rule cut_subtrees_phenotype:
 	input:
@@ -74,6 +74,22 @@ rule cut_subtrees_phenotype:
 			--include-root {PHENOTYPE_ROOT} \
 			{input.onto} > {output.onto}
 			"""
+            
+'''
+Cut Use subtree
+'''
+rule cut_subtrees_use:
+	input:
+		onto='ancillaries/BioNLP-OST+EnovFood-no-obsolete.obo'
+	output:
+		onto='ancillaries/BioNLP-OST+EnovFood-Use.obo'
+	conda: 'softwares/envs/obo-utils-env.yaml'
+	shell: """
+			python softwares/obo-utils/obo-subtree.py \
+			--default-exclude \
+			--include-root {USE_ROOT} \
+			{input.onto} > {output.onto}
+			"""
 
 
 '''
@@ -192,9 +208,9 @@ convert use results to json
 '''
 rule convert_obo2json_use:
 	input:
-		obo='ancillaries/Use_V2.obo'
+		obo='ancillaries/BioNLP-OST+EnovFood-Use.obo'
 	output:
-		json='ancillaries/Use_V2.json'
+		json='ancillaries/BioNLP-OST+EnovFood-Use.json'
 	conda: 'softwares/envs/obo-utils-env.yaml'
 	shell: 'python softwares/obo-utils/obo2json.py --root {USE_ROOT} {input.obo} > {output.json}'
 
-- 
GitLab


From 5c2e8670c635ea6af9fde04420fce56eec5c2e12 Mon Sep 17 00:00:00 2001
From: "louise.deleger" <louise.deleger@inra.fr>
Date: Thu, 7 Apr 2022 15:54:22 +0200
Subject: [PATCH 4/4] update Use ontology name

---
 all-bis.snakefile                     | 16 ++++++++--------
 all.snakefile                         | 16 ++++++++--------
 ancillaries/expander.xml              |  4 ++--
 config/config.yaml                    |  4 ++--
 generate_concept_path.snakefile       |  2 +-
 plans/use-extraction.plan             |  4 ++--
 process-evaluate_BioNLP-OST.snakefile |  2 +-
 process_PubMed_corpus.snakefile       |  4 ++--
 8 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/all-bis.snakefile b/all-bis.snakefile
index b7b8dbbe..c6b466b9 100644
--- a/all-bis.snakefile
+++ b/all-bis.snakefile
@@ -14,28 +14,28 @@ rule all:
 		expander_folder='corpora/pubmed/expander'
 		#onto_habitat_json='ancillaries/BioNLP-OST+EnovFood-Habitat.json',
 		#onto_phenotype_json='ancillaries/BioNLP-OST+EnovFood-Phenotype.json',
-		#onto_use_json='ancillaries/Use_V2.json'
+		#onto_use_json='ancillaries/BioNLP-OST+EnovFood-Use.json'
 
 
 rule preprocess_ontology:
 	input:
 		ontobiotope='ancillaries/BioNLP-OST+EnovFood.obo',
-		use_onto='ancillaries/Use_V2.obo',
+		use_onto='ancillaries/BioNLP-OST+EnovFood-Use.obo',
 		names='ancillaries/extended-microorganisms-taxonomy/names.dmp'
 	output:
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.json',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.json',
-		'ancillaries/Use_V2.json',
+		'ancillaries/BioNLP-OST+EnovFood-Use.json',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
-		#'ancillaries/Use_V2.tomap',
+		#'ancillaries/BioNLP-OST+EnovFood-Use.tomap',
 		'ancillaries/food-process-lexicon.txt',
 		'ancillaries/NCBI_taxa_ontobiotope.txt',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.obo',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.obo',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.paths',
-		'ancillaries/Use_V2.paths'
+		'ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	shell: """snakemake --verbose \
 	    --printshellcmds \
 	    --use-singularity \
@@ -77,7 +77,7 @@ rule process_genbank_corpus:
 		phenotype_tomap='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		habitat_paths='ancillaries/BioNLP-OST+EnovFood-Habitat.paths', 
 		phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
-		use_paths='ancillaries/Use_V2.paths'
+		use_paths='ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	output:
 		'corpora/genbank/test-3.2.txt'
 	shell: """snakemake --verbose \
@@ -99,7 +99,7 @@ rule process_dsmz_corpus:
 		phenotype_tomap='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		habitat_paths='ancillaries/BioNLP-OST+EnovFood-Habitat.paths', 
 		phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
-		use_paths='ancillaries/Use_V2.paths'
+		use_paths='ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	output:
 		'corpora/dsmz/test-3.3.txt'
 	shell: """snakemake --verbose \
@@ -122,7 +122,7 @@ rule process_pubmed_corpus:
 		phenotype_tomap='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		habitat_paths='ancillaries/BioNLP-OST+EnovFood-Habitat.paths', 
 		phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
-		use_paths='ancillaries/Use_V2.paths'
+		use_paths='ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	output:
 		expander_folder=directory("corpora/pubmed/expander"),
 		index_folder=directory("corpora/pubmed/index"),
diff --git a/all.snakefile b/all.snakefile
index cd859c42..e71e9497 100644
--- a/all.snakefile
+++ b/all.snakefile
@@ -14,27 +14,27 @@ rule all:
 		expander_folder='corpora/pubmed/expander'
 		#onto_habitat_json='ancillaries/BioNLP-OST+EnovFood-Habitat.json',
 		#onto_phenotype_json='ancillaries/BioNLP-OST+EnovFood-Phenotype.json',
-		#onto_use_json='ancillaries/Use_V2.json'
+		#onto_use_json='ancillaries/BioNLP-OST+EnovFood-Use.json'
 
 
 rule preprocess_ontology:
 	input:
 		ontobiotope='ancillaries/BioNLP-OST+EnovFood.obo',
-		use_onto='ancillaries/Use_V2.obo',
+		use_onto='ancillaries/BioNLP-OST+EnovFood-Use.obo',
 		names='ancillaries/extended-microorganisms-taxonomy/names.dmp'
 	output:
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.json',
-		'ancillaries/Use_V2.json',
+		'ancillaries/BioNLP-OST+EnovFood-Use.json',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
-		#'ancillaries/Use_V2.tomap',
+		#'ancillaries/BioNLP-OST+EnovFood-Use.tomap',
 		'ancillaries/food-process-lexicon.txt',
 		'ancillaries/NCBI_taxa_ontobiotope.txt',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.obo',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.obo',
 		'ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
 		'ancillaries/BioNLP-OST+EnovFood-Habitat.paths',
-		'ancillaries/Use_V2.paths'
+		'ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	shell: """snakemake --verbose \
 	    --printshellcmds \
 	    --use-singularity \
@@ -79,7 +79,7 @@ rule process_genbank_corpus:
 		phenotype_tomap='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		habitat_paths='ancillaries/BioNLP-OST+EnovFood-Habitat.paths', 
 		phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
-		use_paths='ancillaries/Use_V2.paths'
+		use_paths='ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	output:
 		'corpora/genbank/test-3.2.txt'
 	shell: """snakemake --verbose \
@@ -103,7 +103,7 @@ rule process_dsmz_corpus:
 		phenotype_tomap='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		habitat_paths='ancillaries/BioNLP-OST+EnovFood-Habitat.paths', 
 		phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
-		use_paths='ancillaries/Use_V2.paths'
+		use_paths='ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	output:
 		'corpora/dsmz/test-3.3.txt'
 	shell: """snakemake --verbose \
@@ -128,7 +128,7 @@ rule process_pubmed_corpus:
 		phenotype_tomap='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
 		habitat_paths='ancillaries/BioNLP-OST+EnovFood-Habitat.paths', 
 		phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths',
-		use_paths='ancillaries/Use_V2.paths'
+		use_paths='ancillaries/BioNLP-OST+EnovFood-Use.paths'
 	output:
 		expander_folder=directory("corpora/pubmed/expander"),
 		index_folder=directory("corpora/pubmed/index"),
diff --git a/ancillaries/expander.xml b/ancillaries/expander.xml
index 6554dc12..c8b2c518 100644
--- a/ancillaries/expander.xml
+++ b/ancillaries/expander.xml
@@ -26,11 +26,11 @@
   </obo>
 
   <obo>
-    <source>ancillaries/Use_V2.obo</source>
+    <source>ancillaries/BioNLP-OST+EnovFood-Use.obo</source>
     <prefix>{use}</prefix>
     <suffix>/</suffix>
     <type>use</type>
-    <json-property root-id="EC:0000000">ontology.OntoBiotope-Use.json</json-property>
+    <json-property root-id="OBT:004185">ontology.OntoBiotope-Use.json</json-property>
   </obo>
 
 </compound>
diff --git a/config/config.yaml b/config/config.yaml
index 3a32b1d6..154fc272 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -2,9 +2,9 @@
 
 ONTOBIOTOPE: "ancillaries/BioNLP-OST+EnovFood.obo"
 
-USE: "ancillaries/Use_V2.obo"
+USE: "ancillaries/BioNLP-OST+EnovFood-Use.obo"
 
-ONTONAMES: "BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype Use_V2"
+ONTONAMES: "BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype BioNLP-OST+EnovFood-Use"
 
 NCBI_TAXO_ID: "ancillaries/supertaxo/taxa+id_full.txt"
 
diff --git a/generate_concept_path.snakefile b/generate_concept_path.snakefile
index e60cc9ff..ca1ba773 100644
--- a/generate_concept_path.snakefile
+++ b/generate_concept_path.snakefile
@@ -3,7 +3,7 @@ configfile: "config/config.yaml"
 
 
 
-ONTONAMES = 'BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype Use_V2'
+ONTONAMES = 'BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype BioNLP-OST+EnovFood-Use'
 
 
 
diff --git a/plans/use-extraction.plan b/plans/use-extraction.plan
index 87195d8d..60d70fc4 100644
--- a/plans/use-extraction.plan
+++ b/plans/use-extraction.plan
@@ -2,7 +2,7 @@
 <alvisnlp-plan id="Use-extraction">
 
   <exact-match class="OBOProjector">
-    <oboFiles>ancillaries/Use_V2.obo</oboFiles>
+    <oboFiles>ancillaries/BioNLP-OST+EnovFood-Use.obo</oboFiles>
     <targetLayerName>uses</targetLayerName>
     <subject feature="form" layer="words"/>
     <idFeature>concept-id</idFeature>
@@ -12,7 +12,7 @@
   </exact-match>
 
   <exact-match-2 class="OBOProjector">
-    <oboFiles>ancillaries/Use_V2.obo</oboFiles>
+    <oboFiles>ancillaries/BioNLP-OST+EnovFood-Use.obo</oboFiles>
     <targetLayerName>uses2</targetLayerName>
     <subject feature="lemma" layer="words"/>
     <idFeature>concept-id</idFeature>
diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile
index e50a8be9..c81ae2cd 100644
--- a/process-evaluate_BioNLP-OST.snakefile
+++ b/process-evaluate_BioNLP-OST.snakefile
@@ -46,7 +46,7 @@ rule run_bionlp_prediction:
         tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
         graylist='ancillaries/graylist_extended.heads',
 		emptywords='ancillaries/stopwords_EN.ttg',
-        ontobiotopeUse='ancillaries/Use_V2.obo',
+        ontobiotopeUse='ancillaries/BioNLP-OST+EnovFood-Use.obo',
 		plan='plans/entities.plan',
 		dir='corpora/BioNLP-OST-2019/batches/{B}/',
 		taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt',
diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile
index 8fd501c8..b4b21081 100644
--- a/process_PubMed_corpus.snakefile
+++ b/process_PubMed_corpus.snakefile
@@ -53,7 +53,7 @@ rule run_pubmed_entities:
         tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap',
         graylist='ancillaries/graylist_extended.heads',
 		emptywords='ancillaries/stopwords_EN.ttg',
-        ontobiotopeUse='ancillaries/Use_V2.obo',
+        ontobiotopeUse='ancillaries/BioNLP-OST+EnovFood-Use.obo',
 		plan='plans/entities.plan',
 		dir='corpora/pubmed/batches/{B}/',
 		taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt',
@@ -133,7 +133,7 @@ rule create_pubmed_expander:
 		taxa_id_microorganisms="ancillaries/extended-microorganisms-taxonomy/taxa+id_microorganisms.txt",
                 onto_habitat="ancillaries/BioNLP-OST+EnovFood-Habitat.obo",
 		onto_phenotype="ancillaries/BioNLP-OST+EnovFood-Phenotype.obo",
-		onto_use="ancillaries/Use_V2.obo"
+		onto_use="ancillaries/BioNLP-OST+EnovFood-Use.obo"
 	output:
 		expander_folder=directory("corpora/pubmed/expander")
 	params:
-- 
GitLab