From 5bbac56d2a331b1b3f15b95d39bc957738fec00f Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:20:07 +0900
Subject: [PATCH 01/70] aist new license

---
 LICENSE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LICENSE b/LICENSE
index 3269683..e81a63a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2019 National Institute of Advanced Industrial Science and Technology (AIST)
+   Copyright National Institute of Advanced Industrial Science and Technology (AIST), AIST-Product-ID: 2022PRO-2776
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

From 230cf364c99995595b54784a70e9c80b4dfee028 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:20:35 +0900
Subject: [PATCH 02/70] setup environment via conda

---
 README.md               | 35 ++++++++++++++++++++++++++++-------
 setup/conda-activate.sh |  6 ++++++
 setup/conda-create.sh   |  7 +++++++
 setup/conda-install.sh  |  9 +++++++++
 setup/install-brat.sh   | 10 ++++++++++
 5 files changed, 60 insertions(+), 7 deletions(-)
 create mode 100644 setup/conda-activate.sh
 create mode 100644 setup/conda-create.sh
 create mode 100644 setup/conda-install.sh
 create mode 100644 setup/install-brat.sh

diff --git a/README.md b/README.md
index f10213e..e669e34 100644
--- a/README.md
+++ b/README.md
@@ -31,9 +31,36 @@ A deep learning model to predict named entities, triggers, and nested events fro
 - [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz)
 
 # 2. Preparation
-## 2.1. Requirements
+## 2.1. Environment
+
+1. Install conda environment
+
+```bash
+sh setup/conda-install.sh
+```
+
+2. Create a conda environment (for the first time)
+
+```bash
+. setup/conda-create.sh
+```
+
+3. Activate the conda environment
+
+- Run this every time: before installing packages, before running evaluation scripts, etc.
+
+```bash
+. setup/conda-activate.sh
+```
+
+4. Install requirements
 - Python 3.6.5
 - PyTorch (torch==1.1.0 torchvision==0.3.0, cuda92)
+- Python dependencies
+
+```bash
+pip install -r requirements.txt
+```
 
 ```bash
 virtualenv -p python3 pytorch-env
 source pytorch-env/bin/activate
 export CUDA_VISIBLE_DEVICES=0
 CUDA_PATH=/usr/local/cuda pip install torch==1.1.0 torchvision==0.3.0
 ```
 
-- Install Python packages
-
-```bash
-sh install.sh
-```
-
 ## 2.2. BERT
 
 - Download SciBERT BERT model from PyTorch AllenNLP
diff --git a/setup/conda-activate.sh b/setup/conda-activate.sh
new file mode 100644
index 0000000..06ded76
--- /dev/null
+++ b/setup/conda-activate.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+ROOT=$PWD
+source $ROOT/miniconda3/bin/activate deepeventmine-conda-env
+
+echo "Activated conda environment: deepeventmine-conda-env"
\ No newline at end of file
diff --git a/setup/conda-create.sh b/setup/conda-create.sh
new file mode 100644
index 0000000..c9fd537
--- /dev/null
+++ b/setup/conda-create.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+ROOT=$PWD
+source $ROOT/miniconda3/bin/activate
+conda create -n deepeventmine-conda-env python=3.6
+
+echo "Created conda environment: deepeventmine-conda-env"
\ No newline at end of file
diff --git a/setup/conda-install.sh b/setup/conda-install.sh
new file mode 100644
index 0000000..40a0d2d
--- /dev/null
+++ b/setup/conda-install.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+ROOT=$PWD
+
+# install conda
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh -p $ROOT/miniconda3
+
+echo "Done!"
\ No newline at end of file
diff --git a/setup/install-brat.sh b/setup/install-brat.sh
new file mode 100644
index 0000000..bf72df2
--- /dev/null
+++ b/setup/install-brat.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+ROOT=$PWD
+BRAT_DIR="$ROOT/brat"
+
+git clone https://github.com/nlplab/brat.git
+
+cd $BRAT_DIR
+./install.sh -u
+
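Note: a quick sanity check for the environment that patch 02 sets up. This is a minimal sketch, not part of the series; it assumes the default install prefix ./miniconda3 and the environment name deepeventmine-conda-env used by the scripts above.

```bash
# activate the environment created by setup/conda-create.sh
source ./miniconda3/bin/activate deepeventmine-conda-env

# the interpreter should resolve inside the new environment and report Python 3.6.x
which python
python --version
```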
From 8bbf011a448ebc7012e501bd5fd9531ce05c6cc5 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:26:30 +0900
Subject: [PATCH 03/70] update requirements

---
 requirements.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 9e5c209..a74894f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,6 @@ texttable
 sklearn
 pytorch-nlp
 tqdm
-requests
\ No newline at end of file
+requests
+torch
+torchvision
\ No newline at end of file

From e1a812d10457037886ed0ca67686855e54e08ea7 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:28:08 +0900
Subject: [PATCH 04/70] torch version

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a74894f..d31bf44 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,5 +11,5 @@ sklearn
 pytorch-nlp
 tqdm
 requests
-torch
-torchvision
\ No newline at end of file
+torch==1.1.0
+torchvision==0.3.0
\ No newline at end of file
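Note: once `pip install -r requirements.txt` has run inside the environment, the pins from patches 03-04 can be verified with a one-liner. The `cuda` flag may legitimately print False on a CPU-only machine, since availability depends on the local driver rather than on the installed wheel.

```bash
python -c "import torch, torchvision; print('torch', torch.__version__, 'torchvision', torchvision.__version__, 'cuda', torch.cuda.is_available())"
```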
From 1428e995e17f2b8f0da010f98ec5fb236be0aab5 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:46:22 +0900
Subject: [PATCH 05/70] prepare for training cg

---
 README.md                  | 69 +++++++++++++++++---------------------
 run/train/download-bert.sh | 18 ++++++++++
 run/train/prepare-cg.sh    | 63 ++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+), 38 deletions(-)
 create mode 100644 run/train/download-bert.sh
 create mode 100644 run/train/prepare-cg.sh

diff --git a/README.md b/README.md
index e669e34..82d0c43 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,12 @@
-# 1. DeepEventMine
+# DeepEventMine
 A deep learning model to predict named entities, triggers, and nested events from biomedical texts.
 - The model and results are reported in our paper: [DeepEventMine: End-to-end Neural Nested Event Extraction from Biomedical Texts](https://doi.org/10.1093/bioinformatics/btaa540), Bioinformatics, 2020.
 
-## 1.1. Features
+## Overview
+1. Features
 - Based on [pre-trained BERT](https://github.com/allenai/scibert)
 - Predict nested entities and nested events
 - Provide our trained models on the seven biomedical tasks
 - Predict for new data given raw text input or PubMed ID
 - Visualize the predicted entities and events on the [brat](http://brat.nlplab.org)
 
-## 1.2. Tasks
+2. Tasks
 
 - DeepEventMine has been trained and evaluated on the following tasks (six BioNLP shared tasks and MLEE).
 
 1. cg: [Cancer Genetics (CG), 2013](https://sites.google.com/site/bionlpst2013/tasks/cancer-genetics)
 2. ge11: [GENIA Event Extraction (GENIA), 2011](http://2011.bionlp-st.org/home/genia-event-extraction-genia)
 3. id: [Infectious Diseases (ID), 2011](http://2011.bionlp-st.org/home/infectious-diseases)
 4. epi: [Epigenetics and Post-translational Modifications (EPI), 2011](http://2011.bionlp-st.org/home/epigenetics-and-post-translational-modifications)
 5. ge13: [GENIA Event Extraction (GENIA), 2013](http://2013.bionlp-st.org/tasks/genia-event-extraction-genia)
 6. pc: [Pathway Curation (PC), 2013](http://2013.bionlp-st.org/tasks/pathway-curation)
 7. mlee: [Multi-Level Event Extraction (MLEE)](http://nactem.ac.uk/MLEE/)
 
-## 1.3. Our trained models and scores
-
-- [Our trained models](https://b2share.eudat.eu/records/80d2de0c57d64419b722dc1afa375f28)
-- [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz)
-
-# 2. Preparation
-## 2.1. Environment
-
+# 1. Preparation
 1. Install conda environment
 
 ```bash
 sh setup/conda-install.sh
 ```
 
 pip install -r requirements.txt
 ```
 
-```bash
-virtualenv -p python3 pytorch-env
-source pytorch-env/bin/activate
-export CUDA_VISIBLE_DEVICES=0
-CUDA_PATH=/usr/local/cuda pip install torch==1.1.0 torchvision==0.3.0
-```
-
+5. [Brat](https://github.com/nlplab/brat) for visualization
+- [brat instructions](http://brat.nlplab.org/installation.html)
 
 ```bash
-sh download.sh bert
+sh setup/install-brat.sh
+python2 standalone.py
 ```
 
-## 2.3. DeepEventMine
-- Download pre-trained DeepEventMine model on a given task
-- [task] = cg (or pc, ge11, epi, etc)
+# 2. Training CG
+1. Download data and process
+- Download data
+- Process data to appropriate format
+- Tokenize texts and retrieve offsets
+- Data statistics
+- Download the processed event structures
 
 ```bash
-sh download.sh deepeventmine [task]
+sh run/train/prepare-cg.sh
 ```
 
-## 2.4 Brat
-- To visualize the output using the [brat](http://brat.nlplab.org)
-- Download [brat v1.3](http://brat.nlplab.org)
-
+2. Download models
+- Download SciBERT model from PyTorch AllenNLP
 ```bash
-sh download.sh brat
+sh run/train/download-bert.sh
 ```
 
-- Install brat based on the [brat instructions](http://brat.nlplab.org/installation.html)
-```bash
-cd brat/brat-v1.3_Crunchy_Frog/
-./install.sh -u
-python2 standalone.py
-```
 
 # 3. Predict (BioNLP tasks)
 
 1. Download corpora
 
 ```bash
 sh download.sh bionlp [task]
 ```
 
-2. Preprocess data
+2. Download our pre-trained DeepEventMine model on a given task
+- [Our trained models](https://b2share.eudat.eu/records/80d2de0c57d64419b722dc1afa375f28)
+- [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz)
+- [task] = cg (or pc, ge11, epi, etc)
+
+```bash
+sh download.sh deepeventmine [task]
+```
+
+3. Preprocess data
 - Tokenize texts and prepare data for prediction
 ```bash
 sh preprocess.sh bionlp
 ```
-3. Generate configs
+4. Generate configs
 - If using GPU: [gpu] = 0, otherwise: [gpu] = -1
 - [task] = cg, pc, etc
 ```bash
diff --git a/run/train/download-bert.sh b/run/train/download-bert.sh
new file mode 100644
index 0000000..0990c9d
--- /dev/null
+++ b/run/train/download-bert.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+root_dir=$PWD
+
+# download BERT model
+BERT_DIR="data/bert"
+mkdir -p $BERT_DIR
+cd $BERT_DIR
+
+wget https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/pytorch_models/scibert_scivocab_cased.tar
+tar -xvf scibert_scivocab_cased.tar
+rm *.tar
+
+cd scibert_scivocab_cased
+tar -xzvf weights.tar.gz
+rm *.tar.gz
+
+cd $root_dir
diff --git a/run/train/prepare-cg.sh b/run/train/prepare-cg.sh
new file mode 100644
index 0000000..8464d86
--- /dev/null
+++ b/run/train/prepare-cg.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# to root
+ROOT=$PWD
+
+# 1. DOWNLOAD DATA
+echo "-----------------------------"
+echo "1. Download CG data"
+
+CG_DIR="data/original_corpora/cg"
+
+mkdir -p $CG_DIR
+
+cd $CG_DIR
+
+wget http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_training_data.tar.gz
+wget http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_development_data.tar.gz
+wget http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_test_data.tar.gz
+
+tar -xzvf BioNLP-ST_2013_CG_training_data.tar.gz
+tar -xzvf BioNLP-ST_2013_CG_development_data.tar.gz
+tar -xzvf BioNLP-ST_2013_CG_test_data.tar.gz
+
+mv BioNLP-ST_2013_CG_training_data train
+mv BioNLP-ST_2013_CG_development_data dev
+mv BioNLP-ST_2013_CG_test_data test
+
+rm *.tar.gz
+
+cd $ROOT
+
+# 2. TOKENIZATION
+echo "-----------------------------"
+echo "2. Tokenization"
+
+input_dir=data/original_corpora/
+output_dir=data/corpora/
+python scripts/preprocess.py \
+    --indir $input_dir \
+    --outdir $output_dir
+
+cd $ROOT
+
+# 3. EVENT STRUCTURES
+echo "-----------------------------"
+echo "3. Download processed event structures"
+
+# our processed event structures data
+FILEID=1qzjY38RdnCvomVAD3XMk_J4pi1gIraA5
+
+CG_EVENT_STRUCTURES="data/general_rules"
+mkdir -p $CG_EVENT_STRUCTURES
+cd $CG_EVENT_STRUCTURES
+
+gdown https://drive.google.com/uc?id=$FILEID
+tar -xzvf cg.tar.gz
+rm *.tar.gz
+
+cd $ROOT
+
+# DONE
+echo "-----------------------------"
+echo "Done!"
\ No newline at end of file
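Note: after `sh run/train/prepare-cg.sh` completes, the corpus layout can be spot-checked before training. This sketch assumes only the directories created by the script above (data/original_corpora/cg for the raw download, data/corpora/cg for the tokenized output that the training configs later point at).

```bash
# each split should contain matching .txt/.a1/.a2 brat files
for split in train dev test; do
    echo "$split: $(ls data/corpora/cg/$split | wc -l) files"
done
ls data/corpora/cg/train | head -n 5
```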
From e5202ca8ce38ef454fbc4a6d0a24f65aeb6899f4 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:48:43 +0900
Subject: [PATCH 06/70] event structures

---
 run/train/prepare-cg.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run/train/prepare-cg.sh b/run/train/prepare-cg.sh
index 8464d86..9c4183f 100644
--- a/run/train/prepare-cg.sh
+++ b/run/train/prepare-cg.sh
@@ -48,7 +48,7 @@ echo "3. Download processed event structures"
 # our processed event structures data
 FILEID=1qzjY38RdnCvomVAD3XMk_J4pi1gIraA5
 
-CG_EVENT_STRUCTURES="data/general_rules"
+CG_EVENT_STRUCTURES="data/event-structures"
 mkdir -p $CG_EVENT_STRUCTURES
 cd $CG_EVENT_STRUCTURES
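Note: prepare-cg.sh (patches 05-06) fetches the processed event structures with `gdown`, which does not appear in the portion of requirements.txt shown in patches 03-04. If the command is missing, install it into the same environment before running the script; this is a sketch under the assumption that a recent gdown release can fetch a public Drive file by id.

```bash
pip install gdown
# re-run only the download step, using the same FILEID as the script
gdown "https://drive.google.com/uc?id=1qzjY38RdnCvomVAD3XMk_J4pi1gIraA5"
```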
From c889d8aaa44f203fb8da8e88bf90453aa121618d Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 15:56:26 +0900
Subject: [PATCH 07/70] generate training configs

---
 README.md                     |   5 +
 configs/cg.yaml               |   4 +-
 configs/default.yaml          | 130 +++++++++++++++--
 run/train/generate-configs.sh |   9 ++
 scripts/generate_configs.py   | 267 ++++++++++++++++++++++++----------
 5 files changed, 317 insertions(+), 98 deletions(-)
 create mode 100644 run/train/generate-configs.sh

diff --git a/README.md b/README.md
index 82d0c43..f8fba47 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,11 @@ sh run/train/prepare-cg.sh
 sh run/train/download-bert.sh
 ```
 
+3. Generate configs
+```bash
+sh run/train/generate_configs.sh cg basic
+```
+- Experiment name: basic, exp1, exp2, etc
 
 # 3. Predict (BioNLP tasks)
 
diff --git a/configs/cg.yaml b/configs/cg.yaml
index 187de59..9b05ddd 100644
--- a/configs/cg.yaml
+++ b/configs/cg.yaml
@@ -1,2 +1,2 @@
-# Configs for specific task
-a2_entities: [DNA_domain_or_region,Protein_domain_or_region]
\ No newline at end of file
+# Configuration for CG task.
+a2_entities: [DNA_domain_or_region, Protein_domain_or_region]
\ No newline at end of file
diff --git a/configs/default.yaml b/configs/default.yaml
index 67c56ca..8e02934 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -1,25 +1,123 @@
-# Configuration file.
+# Configuration file for training model.
 
+# data path
 task_name: cg
+train_data: data/corpora/cg/train/
+dev_data: data/corpora/cg/dev/
+test_data: data/corpora/cg/dev/
 
 bert_model: data/bert/scibert_scivocab_cased
-model_path: data/models/cg/model/
-saved_params: data/models/cg/cg.param
 
-# data
-test_data: ..
+
+# saving options
+result_dir: ..
+save_params: False +save_ner: False +save_rel: False +save_ev: False +save_all_models: False +save_model_pipeline: False + +# hyperparameters +epoch: 100 +batchsize: 16 +learning_rate: 3e-5 +dropout: 0.3 +fp16: False +gpu: 0 + +# dimensions +bert_dim: 768 +hidden_dim: 1000 +etype_dim: 300 # entity type +rtype_dim: 150 # relation type +role_dim: 1000 # event argument + +ner_reduce: False +ner_reduced_size: 500 +rel_reduced_size: 500 +ev_reduced_size: 500 + +# bert +seed: 42 +gradient_accumulation_steps: 1 +loss_scale: 128 +warmup_proportion: 0.1 +max_seq: 128 +bert_warmup_lr: True + +# ner config +ner_label_limit: 2 # nested level +ner_threshold: 0.5 +max_entity_width: 14 +max_trigger_width: 10 + +# event config +ev_threshold: 0.5 +max_ev_level: 3 # nested level +max_ev_args: 4 # maximum number of arguments +ev_nested_epoch: 0 +modality_epoch: 0 +flat_ev_scale: 1 +nest_ev_scale: 1 +modality_weight: 1 +use_dev_rule: False +use_general_rule: False +rule_dir: data/event-structures/cg/ -# eval -ev_eval_script_path: eval/scripts/eval-ev-cg.py +# training options +use_gold_ner: False +use_gold_rel: False +ner_predict_all: True # True/False: predict entity or not +skip_ner: False +skip_rel: False +ner_epoch: 100 +rel_epoch: 100 + +freeze_bert: False +freeze_ner: False +freeze_rel: False + +ner_epoch_limit: -1 +rel_epoch_limit: -1 +ner_loss_weight_main: 1 +rel_loss_weight_minor: 1 +ev_loss_weight_minor: 1 +ner_loss_weight_minor: 1 +rel_loss_weight_main: 1 +ev_loss_weight_main: 1 + +direction: l2r+r2l +lab2ign: 1:Other:2 +include_nested: True +enable_triggers_pair: True +train: True +rel_lb_weight: -1 + +lowercase : False +filter_no_ent_sents: False +use_context: True +min_w_freq: 1 # for singletonW +unk_w_prob: 0.01 # for pre_sentences + +# lstm +use_lstm: False +pretrain_word_model: word-embs/PubMed-shuffle-win-2-sample200.txt + +# evaluation +ner_eval_corpus: cg +rel_eval_script_path: eval/scripts/n2c2.py +ev_eval_script_path: eval/scripts/eval-ev-cg.py # for event +ev_matching: -sp # s: softboundary, p: partialrecursive a2_entities: [] -raw_text: False -ner_predict_all: False -# output -result_dir: .. 
+# prediction setting +predict: False +pipelines: False +t_batch_size: 16 +t_gpu: 0 +t_fp16: False +gold_eval: False -# params -gpu: -1 -batchsize: 16 -seed: 42 \ No newline at end of file +# display options +stats: True # print relations +show_macro: False # print result for relations \ No newline at end of file diff --git a/run/train/generate-configs.sh b/run/train/generate-configs.sh new file mode 100644 index 0000000..c2df32d --- /dev/null +++ b/run/train/generate-configs.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +experiment_dir="experiments/" +mkdir -p $experiment_dir + +task=$1 +setting=$2 + +python scripts/generate_configs.py $experiment_dir $task $setting diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index 4e07162..b6707e9 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -10,63 +10,165 @@ def write_config(datapath, config): """Write config to file""" + # with open(datapath, 'w') as outfile: + # yaml.dump(config, outfile, default_flow_style=False, sort_keys=False) + with open(datapath, 'w') as outfile: for key, value in config.items(): # format - if key == 'bert_model' or key == 'test_data' or key == 'ev_eval_script_path' or key == 'result_dir' or key == 'gpu': + if key == 'result_dir': + outfile.write('\n') + if key == 'epoch': + outfile.write('\n') + if key == 'bert_dim': + outfile.write('\n') + if key == 'ner_reduce': + outfile.write('\n') + if key == 'seed': + outfile.write('\n') + if key == 'ner_label_limit': + outfile.write('\n') + if key == 'ev_threshold': + outfile.write('\n') + if key == 'use_gold_ner': + outfile.write('\n') + if key == 'freeze_bert': + outfile.write('\n') + if key == 'ner_epoch_limit': + outfile.write('\n') + if key == 'direction': + outfile.write('\n') + if key == 'ner_eval_corpus': + outfile.write('\n') + if key == 'predict': outfile.write('\n') outfile.write('{}: {}'.format(key, value)) outfile.write('\n') -def gen_predict_config(predict_config, specific_config, eval_set, config_dir, model_name, taskdir): - """For joint prediction""" +def gen_ner_config(ner_config, task_config, config_dir, taskdir): + """For entity""" - # dev and test sets - if eval_set == 'dev' or eval_set == 'test': - predict_config['test_data'] = ''.join(["data/corpora/", model_name, "/", eval_set, "/"]) - predict_config['result_dir'] = ''.join([taskdir, '/predict-gold-', eval_set, '/']) + ner_config['result_dir'] = ''.join([taskdir, 'ner/']) + ner_config['ner_model_dir'] = ''.join([taskdir, 'ner/model/']) + ner_config['save_ner'] = True - # overwrite task config - overwrite_task_config(predict_config, specific_config) + # overwrite task config + overwrite_task_config(ner_config, task_config) - write_config(os.path.join(config_dir, ''.join(['predict-gold-', eval_set, '.yaml'])), predict_config) + write_config(os.path.join(config_dir, 'train-ner.yaml'), ner_config) - # for raw texts - elif eval_set == 'raw-text': - predict_config['test_data'] = ''.join(["data/processed-raw-text/", model_name, "/"]) - predict_config['result_dir'] = ''.join([taskdir, '/predict-', eval_set, '/']) - predict_config['raw_text'] = True - predict_config['ner_predict_all'] = True - # overwrite task config - overwrite_task_config(predict_config, specific_config) +def gen_rel_config(rel_config, task_config, config_dir, taskdir): + """For relation""" - write_config(os.path.join(config_dir, ''.join(['predict-', eval_set, '.yaml'])), predict_config) + rel_config['result_dir'] = ''.join([taskdir, 'rel/']) + rel_config['rel_model_dir'] = ''.join([taskdir, 'rel/model/']) + 
rel_config['save_rel'] = True + rel_config['use_gold_ner'] = True + rel_config['use_gold_rel'] = False + rel_config['ner_predict_all'] = False + rel_config['skip_ner'] = True + rel_config['ner_epoch'] = -1 + # overwrite task config + overwrite_task_config(rel_config, task_config) + + write_config(os.path.join(config_dir, 'train-rel.yaml'), rel_config) + + +def gen_ev_config(ev_config, task_config, config_dir, taskdir): + """For event""" + + ev_config['result_dir'] = ''.join([taskdir, 'ev/']) + ev_config['ev_model_dir'] = ''.join([taskdir, 'ev/model/']) + ev_config['save_ev'] = True + ev_config['ev_nested_epoch'] = 20 + ev_config['modality_epoch'] = 20 + ev_config['use_general_rule'] = True + ev_config['use_gold_ner'] = True + ev_config['use_gold_rel'] = True + ev_config['ner_predict_all'] = False + ev_config['skip_ner'] = True + ev_config['skip_rel'] = True + ev_config['ner_epoch'] = -1 + ev_config['rel_epoch'] = -1 + ev_config['freeze_bert'] = True + ev_config['freeze_ner'] = True + ev_config['freeze_rel'] = True -def gen_predict_config_pubmed(predict_config, specific_config, config_dir, expdir, dataname): - predict_config['test_data'] = ''.join(["data/", dataname, "/processed-text/", "text/"]) - predict_config['result_dir'] = ''.join([expdir, dataname, '/results/']) - predict_config['raw_text'] = True - predict_config['ner_predict_all'] = True + # overwrite task config + overwrite_task_config(ev_config, task_config) + + write_config(os.path.join(config_dir, 'train-ev.yaml'), ev_config) + + +def gen_joint_config(joint_config, task_config, config_dir, taskdir): + """For joint""" + + joint_config['result_dir'] = ''.join([taskdir, 'joint-gold/']) + joint_config['joint_model_dir'] = ''.join([taskdir, 'joint-gold/model/']) + joint_config['save_params'] = True + joint_config['save_all_models'] = True + joint_config['use_general_rule'] = True + joint_config['ner_model_dir'] = ''.join([taskdir, 'ner/model/']) + joint_config['rel_model_dir'] = ''.join([taskdir, 'rel/model/']) + joint_config['ev_model_dir'] = ''.join([taskdir, 'ev/model/']) + joint_config['ner_predict_all'] = False + joint_config['ner_epoch'] = -1 + joint_config['rel_epoch'] = -1 + joint_config['ner_epoch_limit'] = 70 + joint_config['rel_epoch_limit'] = 90 + joint_config['rel_loss_weight_minor'] = 0.001 + joint_config['ev_loss_weight_minor'] = 0.001 + joint_config['ner_loss_weight_minor'] = 0.5 + joint_config['rel_loss_weight_main'] = 0.5 + joint_config['ev_loss_weight_main'] = 0.1 # overwrite task config - overwrite_task_config(predict_config, specific_config) - write_config(os.path.join(config_dir, ''.join(['predict-', dataname, '.yaml'])), predict_config) + overwrite_task_config(joint_config, task_config) + write_config(os.path.join(config_dir, 'train-joint-gold.yaml'), joint_config) -def overwrite_task_config(config, specific_config): - """Overwrite config for specific task.""" - # add specific task config - for key, value in specific_config.items(): - if key in config: - config[key] = value +def gen_joint_e2e_config(joint_e2e_config, task_config, config_dir, taskdir): + """For joint end-to-end""" - return config + joint_e2e_config['result_dir'] = ''.join([taskdir, 'joint-e2e/']) + joint_e2e_config['joint_model_dir'] = ''.join([taskdir, 'joint-e2e/model/']) + joint_e2e_config['ner_predict_all'] = True + + # overwrite task config + overwrite_task_config(joint_e2e_config, task_config) + + write_config(os.path.join(config_dir, 'train-joint-e2e.yaml'), joint_e2e_config) + + +def gen_predict_config(predict_config, eval_set, 
config_dir, taskdir): + """For joint prediction""" + + predict_config['test_data'] = predict_config['test_data'].replace('dev', eval_set) + predict_config['result_dir'] = ''.join([taskdir, 'predict-gold-', eval_set, '/']) + predict_config['save_params'] = False + predict_config['joint_model_dir'] = ''.join([taskdir, 'joint-gold/model/']) + predict_config['params'] = ''.join([taskdir, 'joint-gold/', predict_config['task_name'], '.param']) + predict_config['predict'] = True + predict_config['ner_predict_all'] = False + + write_config(os.path.join(config_dir, ''.join(['predict-gold-', eval_set, '.yaml'])), predict_config) + + +def gen_predict_e2e_config(predict_e2e_config, eval_set, config_dir, taskdir): + """For joint end-to-end prediction""" + + predict_e2e_config['result_dir'] = ''.join([taskdir, 'predict-e2e-', eval_set, '/']) + predict_e2e_config['joint_model_dir'] = ''.join([taskdir, 'joint-e2e/model/']) + predict_e2e_config['params'] = ''.join([taskdir, 'joint-e2e/', predict_e2e_config['task_name'], '.param']) + predict_e2e_config['ner_predict_all'] = True + + write_config(os.path.join(config_dir, ''.join(['predict-e2e-', eval_set, '.yaml'])), predict_e2e_config) def read_specific_config(task): @@ -85,11 +187,23 @@ def read_specific_config(task): return specific_config -def generate_configs(taskdir, task, gpu): +def overwrite_task_config(config, specific_config): + """Overwrite config for specific task.""" + + # add specific task config + for key, value in specific_config.items(): + if key in config: + config[key] = value + + return config + + +def generate_configs(expdir, task, setting): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(taskdir, 'configs') + taskdir = os.path.join(expdir, '/'.join([task, setting, ''])) + config_dir = os.path.join(expdir, '/'.join([task, setting, 'configs', ''])) utils.makedir(config_dir) # default setting @@ -102,54 +216,54 @@ def generate_configs(taskdir, task, gpu): # generate config for each task task_config = default_config.copy() - task_config['gpu'] = gpu - task_config['task_name'] = task_config['task_name'].replace('cg', task) - task_config['model_path'] = task_config['model_path'].replace('cg', task) - task_config['saved_params'] = task_config['saved_params'].replace('cg', task) - task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', task) - # predict config - predict_dev_config = task_config.copy() - gen_predict_config(predict_dev_config, specific_config, 'dev', config_dir, task, taskdir) + # generate data path + task_config['train_data'] = ''.join(["data/corpora/", task, "/train/"]) + task_config['dev_data'] = ''.join(["data/corpora/", task, "/dev/"]) + task_config['test_data'] = ''.join(["data/corpora/", task, "/dev/"]) + # bert + task_config['bert_model'] = "data/bert/scibert_scivocab_cased" - predict_test_config = task_config.copy() - gen_predict_config(predict_test_config, specific_config, 'test', config_dir, task, taskdir) + # task specific + task_config['task_name'] = task + task_config['ner_eval_corpus'] = task + task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', task) + task_config['rule_dir'] = task_config['rule_dir'].replace('cg', task) - # for raw text - predict_test_config = task_config.copy() - gen_predict_config(predict_test_config, specific_config, 'raw-text', config_dir, task, taskdir) + # ner config + ner_config = task_config.copy() + gen_ner_config(ner_config, specific_config, config_dir, taskdir) - print('Generate configs: 
Done!')
 
     return
 
 
-def generate_configs_pubmed(expdir, dataname, model_name, gpu):
-    """Generate configs for all."""
-
-    # create experiment dir
-    config_dir = os.path.join(expdir, ''.join([dataname, '/configs']))
-    utils.makedir(config_dir)
-
-    # default setting
-    default_config_path = 'configs/default.yaml'
-    with open(default_config_path, 'r') as stream:
-        default_config = utils._ordered_load(stream)
-
-    # read config for specific task
-    specific_config = read_specific_config(model_name)
-
-    # generate config for each task
-    task_config = default_config.copy()
-    task_config['gpu'] = gpu
-    task_config['task_name'] = task_config['task_name'].replace('cg', model_name)
-    task_config['model_path'] = task_config['model_path'].replace('cg', model_name)
-    task_config['saved_params'] = task_config['saved_params'].replace('cg', model_name)
-    task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', model_name)
-
-    # for raw text
-    predict_test_config = task_config.copy()
-    gen_predict_config_pubmed(predict_test_config, specific_config, config_dir, expdir, dataname)
 
     print('Generate configs: Done!')
 
     return
 
 
 if __name__ == '__main__':
-    # generate_configs_pubmed("experiments/", "cg", "my-pubmed", 0)
-
-    # bionlp
-    if len(sys.argv) == 4:
-        generate_configs(sys.argv[1], sys.argv[2], sys.argv[3])
-
-    # pubmed
-    elif len(sys.argv) == 5:
-        generate_configs_pubmed(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
+    # generate_configs("experiments/", "cg", "basic")
+    generate_configs(sys.argv[1], sys.argv[2], sys.argv[3])
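Note: a small sketch for inspecting what patch 07 actually emits. The paths assume the defaults from the series (`experiments/` plus task `cg` and experiment name `basic`), and PyYAML is assumed to be available, since scripts/generate_configs.py already loads YAML through utils.

```python
# print a few of the keys written by gen_ner_config() in scripts/generate_configs.py
import yaml

with open('experiments/cg/basic/configs/train-ner.yaml') as f:
    config = yaml.safe_load(f)

for key in ('task_name', 'train_data', 'result_dir', 'ner_model_dir', 'save_ner'):
    print(key, '=', config.get(key))
```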
Generate configs +- Configs for training CG task ```bash sh run/train/generate_configs.sh cg basic ``` - Experiment name: basic, exp1, exp2, etc +- Or running this debug mode (on a small data with several epochs) +```bash +sh run/train/generate_configs.sh cg debug +``` # 3. Predict (BioNLP tasks) diff --git a/run/train/generate-configs-debug.sh b/run/train/generate-configs-debug.sh new file mode 100644 index 0000000..503c7e6 --- /dev/null +++ b/run/train/generate-configs-debug.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +TASK=$1 # cg +EXPERIMENT_NAME=$2 # basic + +EXPERIMENT_DIR="experiments/" +mkdir -p $EXPERIMENT_DIR + +python scripts/generate_configs.py \ + --experiment_dir $EXPERIMENT_DIR \ + --task_name $TASK \ + --experiment_name $EXPERIMENT_NAME \ + --debug_mode \ No newline at end of file diff --git a/run/train/generate-configs.sh b/run/train/generate-configs.sh index c2df32d..fa79fc4 100644 --- a/run/train/generate-configs.sh +++ b/run/train/generate-configs.sh @@ -1,9 +1,12 @@ #!/bin/bash -experiment_dir="experiments/" -mkdir -p $experiment_dir +TASK=$1 # cg +EXPERIMENT_NAME=$2 # basic -task=$1 -setting=$2 +EXPERIMENT_DIR="experiments/" +mkdir -p $EXPERIMENT_DIR -python scripts/generate_configs.py $experiment_dir $task $setting +python scripts/generate_configs.py \ + --experiment_dir $EXPERIMENT_DIR \ + --task_name $TASK \ + --experiment_name $EXPERIMENT_NAME \ diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index b6707e9..d77339c 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -2,6 +2,7 @@ import os import sys +import argparse sys.path.insert(0, '.') from utils import utils @@ -198,15 +199,26 @@ def overwrite_task_config(config, specific_config): return config -def generate_configs(expdir, task, setting): +def set_debug_mode(configs, args): + if args.debug_mode: + if "train_data" in configs: + configs['train_data'] = configs['train_data'].replace('train', "debug") + if "dev_data" in configs: + configs['dev_data'] = configs['dev_data'].replace('dev', "debug") + if "test_data" in configs: + configs['test_data'] = configs['test_data'].replace('test', "debug") + if "epoch" in configs: + configs["epoch"] = 2 + +def generate_configs(args, expdir, task, exp_name): """Generate configs for all.""" # create experiment dir - taskdir = os.path.join(expdir, '/'.join([task, setting, ''])) - config_dir = os.path.join(expdir, '/'.join([task, setting, 'configs', ''])) + taskdir = os.path.join(expdir, '/'.join([task, exp_name, ''])) + config_dir = os.path.join(expdir, '/'.join([task, exp_name, 'configs', ''])) utils.makedir(config_dir) - # default setting + # default exp_name default_config_path = 'configs/default.yaml' with open(default_config_path, 'r') as stream: default_config = utils._ordered_load(stream) @@ -221,6 +233,10 @@ def generate_configs(expdir, task, setting): task_config['train_data'] = ''.join(["data/corpora/", task, "/train/"]) task_config['dev_data'] = ''.join(["data/corpora/", task, "/dev/"]) task_config['test_data'] = ''.join(["data/corpora/", task, "/dev/"]) + + # debug mode + set_debug_mode(task_config, args) + # bert task_config['bert_model'] = "data/bert/scibert_scivocab_cased" @@ -270,6 +286,18 @@ def generate_configs(expdir, task, setting): return +def main(arguments): + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_dir', help='Directory for experiments', type=str, default='experiments') + parser.add_argument('--task_name', help='Name of task', type=str, default='cg') + 
parser.add_argument('--experiment_name', help='Name of this experiment', type=str, + default='basic') + parser.add_argument("--debug_mode", action='store_true', + help="Run experiments on a small data for debugging quickly") + args = parser.parse_args(arguments) + + generate_configs(args, args.experiment_dir, args.task_name, args.experiment_name) + if __name__ == '__main__': - # generate_configs("experiments/", "cg", "basic") - generate_configs(sys.argv[1], sys.argv[2], sys.argv[3]) + # generate_configs("experiments/", "cg", "debug_mode") + main(sys.argv[1:]) From a83ddaa1e5fac60e3663db49f6b527b4da9b8de8 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:12:28 +0900 Subject: [PATCH 09/70] fix bug --- README.md | 2 +- scripts/generate_configs.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0b525a1..d9a3eff 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ sh run/train/generate_configs.sh cg basic - Experiment name: basic, exp1, exp2, etc - Or running this debug mode (on a small data with several epochs) ```bash -sh run/train/generate_configs.sh cg debug +sh run/train/generate_configs-debug.sh cg debug ``` # 3. Predict (BioNLP tasks) diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index d77339c..5975fea 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -268,17 +268,21 @@ def generate_configs(args, expdir, task, exp_name): # predict config predict_dev_config = task_config.copy() + set_debug_mode(predict_dev_config, args) gen_predict_config(predict_dev_config, 'dev', config_dir, taskdir) predict_test_config = task_config.copy() + set_debug_mode(predict_test_config, args) gen_predict_config(predict_test_config, 'test', config_dir, taskdir) # predict end-to-end config predict_e2e_dev_config = predict_dev_config.copy() + set_debug_mode(predict_e2e_dev_config, args) gen_predict_e2e_config(predict_e2e_dev_config, 'dev', config_dir, taskdir) predict_e2e_test_config = predict_test_config.copy() + set_debug_mode(predict_e2e_test_config, args) gen_predict_e2e_config(predict_e2e_test_config, 'test', config_dir, taskdir) print('Generate configs: Done!') From a1950b1c2c13befdf469f76a27a32ef09e86eced Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:17:04 +0900 Subject: [PATCH 10/70] fix bug --- scripts/generate_configs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index 5975fea..629a03c 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -151,6 +151,7 @@ def gen_predict_config(predict_config, eval_set, config_dir, taskdir): """For joint prediction""" predict_config['test_data'] = predict_config['test_data'].replace('dev', eval_set) + set_debug_mode(predict_config, args) predict_config['result_dir'] = ''.join([taskdir, 'predict-gold-', eval_set, '/']) predict_config['save_params'] = False predict_config['joint_model_dir'] = ''.join([taskdir, 'joint-gold/model/']) @@ -165,6 +166,7 @@ def gen_predict_e2e_config(predict_e2e_config, eval_set, config_dir, taskdir): """For joint end-to-end prediction""" predict_e2e_config['result_dir'] = ''.join([taskdir, 'predict-e2e-', eval_set, '/']) + set_debug_mode(predict_e2e_config, args) predict_e2e_config['joint_model_dir'] = ''.join([taskdir, 'joint-e2e/model/']) predict_e2e_config['params'] = ''.join([taskdir, 'joint-e2e/', predict_e2e_config['task_name'], '.param']) predict_e2e_config['ner_predict_all'] = True @@ 
-268,21 +270,17 @@ def generate_configs(args, expdir, task, exp_name): # predict config predict_dev_config = task_config.copy() - set_debug_mode(predict_dev_config, args) gen_predict_config(predict_dev_config, 'dev', config_dir, taskdir) predict_test_config = task_config.copy() - set_debug_mode(predict_test_config, args) gen_predict_config(predict_test_config, 'test', config_dir, taskdir) # predict end-to-end config predict_e2e_dev_config = predict_dev_config.copy() - set_debug_mode(predict_e2e_dev_config, args) gen_predict_e2e_config(predict_e2e_dev_config, 'dev', config_dir, taskdir) predict_e2e_test_config = predict_test_config.copy() - set_debug_mode(predict_e2e_test_config, args) gen_predict_e2e_config(predict_e2e_test_config, 'test', config_dir, taskdir) print('Generate configs: Done!') From d9c7bcbbe3d98652f929976778fb8d13c04224ec Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:18:12 +0900 Subject: [PATCH 11/70] fix bug --- scripts/generate_configs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index 629a03c..4d41e80 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -147,7 +147,7 @@ def gen_joint_e2e_config(joint_e2e_config, task_config, config_dir, taskdir): write_config(os.path.join(config_dir, 'train-joint-e2e.yaml'), joint_e2e_config) -def gen_predict_config(predict_config, eval_set, config_dir, taskdir): +def gen_predict_config(args, predict_config, eval_set, config_dir, taskdir): """For joint prediction""" predict_config['test_data'] = predict_config['test_data'].replace('dev', eval_set) @@ -162,7 +162,7 @@ def gen_predict_config(predict_config, eval_set, config_dir, taskdir): write_config(os.path.join(config_dir, ''.join(['predict-gold-', eval_set, '.yaml'])), predict_config) -def gen_predict_e2e_config(predict_e2e_config, eval_set, config_dir, taskdir): +def gen_predict_e2e_config(args, predict_e2e_config, eval_set, config_dir, taskdir): """For joint end-to-end prediction""" predict_e2e_config['result_dir'] = ''.join([taskdir, 'predict-e2e-', eval_set, '/']) @@ -270,18 +270,18 @@ def generate_configs(args, expdir, task, exp_name): # predict config predict_dev_config = task_config.copy() - gen_predict_config(predict_dev_config, 'dev', config_dir, taskdir) + gen_predict_config(args, predict_dev_config, 'dev', config_dir, taskdir) predict_test_config = task_config.copy() - gen_predict_config(predict_test_config, 'test', config_dir, taskdir) + gen_predict_config(args, predict_test_config, 'test', config_dir, taskdir) # predict end-to-end config predict_e2e_dev_config = predict_dev_config.copy() - gen_predict_e2e_config(predict_e2e_dev_config, 'dev', config_dir, taskdir) + gen_predict_e2e_config(args, predict_e2e_dev_config, 'dev', config_dir, taskdir) predict_e2e_test_config = predict_test_config.copy() - gen_predict_e2e_config(predict_e2e_test_config, 'test', config_dir, taskdir) + gen_predict_e2e_config(args, predict_e2e_test_config, 'test', config_dir, taskdir) print('Generate configs: Done!') From f5ca6f671b2b57d1aaf12df55e518cd4c9d50ac1 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:21:30 +0900 Subject: [PATCH 12/70] fix bugs --- scripts/generate_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index 4d41e80..130b1b5 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -207,7 +207,9 @@ def set_debug_mode(configs, args): 
configs['train_data'] = configs['train_data'].replace('train', "debug") if "dev_data" in configs: configs['dev_data'] = configs['dev_data'].replace('dev', "debug") + configs['dev_data'] = configs['dev_data'].replace('test', "debug") if "test_data" in configs: + configs['test_data'] = configs['test_data'].replace('dev', "debug") configs['test_data'] = configs['test_data'].replace('test', "debug") if "epoch" in configs: configs["epoch"] = 2 From 63e73b696bc1372acb3c914913ae8d0e3262318e Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:24:51 +0900 Subject: [PATCH 13/70] cg debug data --- data/corpora/cg/debug/PMID-1388088.a1 | 21 + data/corpora/cg/debug/PMID-1388088.a2 | 28 + data/corpora/cg/debug/PMID-1388088.ann | 64 + data/corpora/cg/debug/PMID-1388088.inv.map | 1100 ++++++++++++ data/corpora/cg/debug/PMID-1388088.map | 912 ++++++++++ data/corpora/cg/debug/PMID-1388088.txt | 5 + data/corpora/cg/debug/PMID-1388088.txt.ori | 2 + data/corpora/cg/debug/PMID-198130.a1 | 35 + data/corpora/cg/debug/PMID-198130.a2 | 39 + data/corpora/cg/debug/PMID-198130.ann | 98 ++ data/corpora/cg/debug/PMID-198130.inv.map | 1779 ++++++++++++++++++++ data/corpora/cg/debug/PMID-198130.map | 1513 +++++++++++++++++ data/corpora/cg/debug/PMID-198130.txt | 6 + data/corpora/cg/debug/PMID-198130.txt.ori | 2 + 14 files changed, 5604 insertions(+) create mode 100644 data/corpora/cg/debug/PMID-1388088.a1 create mode 100644 data/corpora/cg/debug/PMID-1388088.a2 create mode 100644 data/corpora/cg/debug/PMID-1388088.ann create mode 100644 data/corpora/cg/debug/PMID-1388088.inv.map create mode 100644 data/corpora/cg/debug/PMID-1388088.map create mode 100644 data/corpora/cg/debug/PMID-1388088.txt create mode 100644 data/corpora/cg/debug/PMID-1388088.txt.ori create mode 100644 data/corpora/cg/debug/PMID-198130.a1 create mode 100644 data/corpora/cg/debug/PMID-198130.a2 create mode 100644 data/corpora/cg/debug/PMID-198130.ann create mode 100644 data/corpora/cg/debug/PMID-198130.inv.map create mode 100644 data/corpora/cg/debug/PMID-198130.map create mode 100644 data/corpora/cg/debug/PMID-198130.txt create mode 100644 data/corpora/cg/debug/PMID-198130.txt.ori diff --git a/data/corpora/cg/debug/PMID-1388088.a1 b/data/corpora/cg/debug/PMID-1388088.a1 new file mode 100644 index 0000000..5472964 --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.a1 @@ -0,0 +1,21 @@ +T1 Gene_or_gene_product 0 35 Interleukin - 1 receptor antagonist +T2 Cell 71 79 neuronal +T3 Organism 94 97 rat +T4 Gene_or_gene_product 100 115 Interleukin - 1 +T5 Gene_or_gene_product 118 124 IL - 1 +T6 Organ 144 149 brain +T7 Gene_or_gene_product 189 195 IL - 1 +T8 Cell 284 292 neuronal +T9 Organ 320 328 cerebral +T10 Multi-tissue_structure 348 363 cerebral artery +T11 Organism 425 429 rats +T12 Gene_or_gene_product 458 484 IL - 1 receptor antagonist +T13 Gene_or_gene_product 487 495 IL - 1ra +T14 Multi-tissue_structure 589 597 striatal +T15 Simple_chemical 613 617 NMDA +T16 Simple_chemical 639 669 cis - 2 , 4 - methanoglutamate +T17 Gene_or_gene_product 729 737 IL - 1ra +T18 Gene_or_gene_product 776 782 IL - 1 +T19 Organ 826 831 brain +T20 Gene_or_gene_product 864 870 IL - 1 +T21 Cell 943 951 neuronal diff --git a/data/corpora/cg/debug/PMID-1388088.a2 b/data/corpora/cg/debug/PMID-1388088.a2 new file mode 100644 index 0000000..539aed4 --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.a2 @@ -0,0 +1,28 @@ +* Equiv T12 T13 +* Equiv T4 T5 +T22 Negative_regulation 36 44 inhibits +T23 Breakdown 80 86 damage +T24 Gene_expression 127 136 synthesis +T25 
Positive_regulation 153 163 stimulated +T26 Blood_vessel_development 248 266 neovascularization +T27 Death 293 298 death +T28 Negative_regulation 400 409 inhibited +T29 Planned_process 430 438 injected +T30 Planned_process 598 606 infusion +T31 Planned_process 712 721 injection +T32 Regulation 788 796 mediator +T33 Breakdown 832 838 damage +T34 Death 952 957 death +E1 Negative_regulation:T22 Theme:E2 Cause:T1 +E2 Breakdown:T23 Theme:T2 +E3 Gene_expression:T24 Theme:T4 +E4 Positive_regulation:T25 Theme:E3 +E5 Blood_vessel_development:T26 +E6 Death:T27 Theme:T8 +E7 Negative_regulation:T28 Theme:E6 +E8 Planned_process:T29 Instrument:T12 Theme:T11 +E9 Planned_process:T30 Instrument:T16 +E10 Planned_process:T31 Instrument:T17 +E11 Regulation:T32 Cause:T18 Theme:E12 +E12 Breakdown:T33 Theme:T19 +E13 Death:T34 Theme:T21 diff --git a/data/corpora/cg/debug/PMID-1388088.ann b/data/corpora/cg/debug/PMID-1388088.ann new file mode 100644 index 0000000..f8e5d6f --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.ann @@ -0,0 +1,64 @@ +* Equiv T12 T13 +* Equiv T4 T5 +T1 Gene_or_gene_product 0 35 Interleukin - 1 receptor antagonist +T2 Cell 71 79 neuronal +T3 Organism 94 97 rat +T4 Gene_or_gene_product 100 115 Interleukin - 1 +T5 Gene_or_gene_product 118 124 IL - 1 +T6 Organ 144 149 brain +T7 Gene_or_gene_product 189 195 IL - 1 +T8 Cell 284 292 neuronal +T9 Organ 320 328 cerebral +T10 Multi-tissue_structure 348 363 cerebral artery +T11 Organism 425 429 rats +T12 Gene_or_gene_product 458 484 IL - 1 receptor antagonist +T13 Gene_or_gene_product 487 495 IL - 1ra +T14 Multi-tissue_structure 589 597 striatal +T15 Simple_chemical 613 617 NMDA +T16 Simple_chemical 639 669 cis - 2 , 4 - methanoglutamate +T17 Gene_or_gene_product 729 737 IL - 1ra +T18 Gene_or_gene_product 776 782 IL - 1 +T19 Organ 826 831 brain +T20 Gene_or_gene_product 864 870 IL - 1 +T21 Cell 943 951 neuronal +TR22 Negative_regulation 36 44 inhibits +TR23 Breakdown 80 86 damage +TR24 Gene_expression 127 136 synthesis +TR25 Positive_regulation 153 163 stimulated +TR26 Blood_vessel_development 248 266 neovascularization +TR27 Death 293 298 death +TR28 Negative_regulation 400 409 inhibited +TR29 Planned_process 430 438 injected +TR30 Planned_process 598 606 infusion +TR31 Planned_process 712 721 injection +TR32 Regulation 788 796 mediator +TR33 Breakdown 832 838 damage +TR34 Death 952 957 death +R1 Theme Arg1:TR22 Arg2:TR23 +R2 Theme Arg1:TR25 Arg2:TR24 +R3 Theme Arg1:TR33 Arg2:T19 +R4 Cause Arg1:TR32 Arg2:T18 +R5 Instrument Arg1:TR31 Arg2:T17 +R6 Cause Arg1:TR22 Arg2:T1 +R7 Instrument Arg1:TR30 Arg2:T16 +R8 Instrument Arg1:TR29 Arg2:T12 +R9 Theme Arg1:TR24 Arg2:T4 +R10 Theme Arg1:TR27 Arg2:T8 +R11 Theme Arg1:TR29 Arg2:T11 +R12 Theme Arg1:TR28 Arg2:TR27 +R13 Theme Arg1:TR23 Arg2:T2 +R14 Theme Arg1:TR32 Arg2:TR33 +R15 Theme Arg1:TR34 Arg2:T21 +E1 Negative_regulation:TR22 Theme:E2 Cause:T1 +E2 Breakdown:TR23 Theme:T2 +E3 Gene_expression:TR24 Theme:T4 +E4 Positive_regulation:TR25 Theme:E3 +E5 Blood_vessel_development:TR26 +E6 Death:TR27 Theme:T8 +E7 Negative_regulation:TR28 Theme:E6 +E8 Planned_process:TR29 Instrument:T12 Theme:T11 +E9 Planned_process:TR30 Instrument:T16 +E10 Planned_process:TR31 Instrument:T17 +E11 Regulation:TR32 Cause:T18 Theme:E12 +E12 Breakdown:TR33 Theme:T19 +E13 Death:TR34 Theme:T21 diff --git a/data/corpora/cg/debug/PMID-1388088.inv.map b/data/corpora/cg/debug/PMID-1388088.inv.map new file mode 100644 index 0000000..33ee54c --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.inv.map @@ -0,0 +1,1100 @@ +{ + "0": 0, + "1": 
1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + "10": 10, + "11": 11, + "12": 11, + "13": 12, + "14": 12, + "15": 13, + "16": 14, + "17": 15, + "18": 16, + "19": 17, + "20": 18, + "21": 19, + "22": 20, + "23": 21, + "24": 22, + "25": 23, + "26": 24, + "27": 25, + "28": 26, + "29": 27, + "30": 28, + "31": 29, + "32": 30, + "33": 31, + "34": 32, + "35": 33, + "36": 34, + "37": 35, + "38": 36, + "39": 37, + "40": 38, + "41": 39, + "42": 40, + "43": 41, + "44": 42, + "45": 43, + "46": 44, + "47": 45, + "48": 46, + "49": 47, + "50": 48, + "51": 49, + "52": 50, + "53": 51, + "54": 52, + "55": 53, + "56": 54, + "57": 55, + "58": 56, + "59": 57, + "60": 58, + "61": 59, + "62": 60, + "63": 61, + "64": 62, + "65": 63, + "66": 64, + "67": 65, + "68": 66, + "69": 67, + "70": 68, + "71": 69, + "72": 70, + "73": 71, + "74": 72, + "75": 73, + "76": 74, + "77": 75, + "78": 76, + "79": 77, + "80": 78, + "81": 79, + "82": 80, + "83": 81, + "84": 82, + "85": 83, + "86": 84, + "87": 85, + "88": 86, + "89": 87, + "90": 88, + "91": 89, + "92": 90, + "93": 91, + "94": 92, + "95": 93, + "96": 94, + "97": 95, + "98": 95, + "99": 96, + "100": 97, + "101": 98, + "102": 99, + "103": 100, + "104": 101, + "105": 102, + "106": 103, + "107": 104, + "108": 105, + "109": 106, + "110": 107, + "111": 108, + "112": 108, + "113": 109, + "114": 109, + "115": 110, + "116": 111, + "117": 112, + "118": 112, + "119": 113, + "120": 114, + "121": 114, + "122": 115, + "123": 115, + "124": 116, + "125": 116, + "126": 117, + "127": 118, + "128": 119, + "129": 120, + "130": 121, + "131": 122, + "132": 123, + "133": 124, + "134": 125, + "135": 126, + "136": 127, + "137": 128, + "138": 129, + "139": 130, + "140": 131, + "141": 132, + "142": 133, + "143": 134, + "144": 135, + "145": 136, + "146": 137, + "147": 138, + "148": 139, + "149": 140, + "150": 141, + "151": 142, + "152": 143, + "153": 144, + "154": 145, + "155": 146, + "156": 147, + "157": 148, + "158": 149, + "159": 150, + "160": 151, + "161": 152, + "162": 153, + "163": 154, + "164": 155, + "165": 156, + "166": 157, + "167": 158, + "168": 159, + "169": 160, + "170": 161, + "171": 162, + "172": 163, + "173": 164, + "174": 165, + "175": 166, + "176": 167, + "177": 168, + "178": 169, + "179": 170, + "180": 171, + "181": 172, + "182": 173, + "183": 174, + "184": 175, + "185": 176, + "186": 177, + "187": 178, + "188": 179, + "189": 180, + "190": 181, + "191": 182, + "192": 182, + "193": 183, + "194": 183, + "195": 184, + "196": 185, + "197": 186, + "198": 187, + "199": 188, + "200": 189, + "201": 190, + "202": 191, + "203": 192, + "204": 193, + "205": 194, + "206": 195, + "207": 196, + "208": 197, + "209": 198, + "210": 199, + "211": 200, + "212": 201, + "213": 202, + "214": 203, + "215": 204, + "216": 205, + "217": 206, + "218": 207, + "219": 208, + "220": 209, + "221": 210, + "222": 211, + "223": 212, + "224": 213, + "225": 214, + "226": 214, + "227": 215, + "228": 216, + "229": 217, + "230": 218, + "231": 219, + "232": 220, + "233": 221, + "234": 222, + "235": 223, + "236": 224, + "237": 225, + "238": 226, + "239": 227, + "240": 228, + "241": 229, + "242": 230, + "243": 231, + "244": 232, + "245": 233, + "246": 234, + "247": 235, + "248": 236, + "249": 237, + "250": 238, + "251": 239, + "252": 240, + "253": 241, + "254": 242, + "255": 243, + "256": 244, + "257": 245, + "258": 246, + "259": 247, + "260": 248, + "261": 249, + "262": 250, + "263": 251, + "264": 252, + "265": 253, + "266": 254, + "267": 254, + "268": 255, + "269": 256, + "270": 257, 
+ "271": 258, + "272": 259, + "273": 260, + "274": 261, + "275": 262, + "276": 263, + "277": 264, + "278": 265, + "279": 266, + "280": 267, + "281": 268, + "282": 269, + "283": 270, + "284": 271, + "285": 272, + "286": 273, + "287": 274, + "288": 275, + "289": 276, + "290": 277, + "291": 278, + "292": 279, + "293": 280, + "294": 281, + "295": 282, + "296": 283, + "297": 284, + "298": 285, + "299": 286, + "300": 287, + "301": 288, + "302": 289, + "303": 290, + "304": 291, + "305": 292, + "306": 293, + "307": 294, + "308": 295, + "309": 296, + "310": 297, + "311": 298, + "312": 299, + "313": 300, + "314": 301, + "315": 302, + "316": 303, + "317": 304, + "318": 305, + "319": 306, + "320": 307, + "321": 308, + "322": 309, + "323": 310, + "324": 311, + "325": 312, + "326": 313, + "327": 314, + "328": 315, + "329": 316, + "330": 317, + "331": 318, + "332": 319, + "333": 320, + "334": 321, + "335": 322, + "336": 323, + "337": 324, + "338": 325, + "339": 326, + "340": 327, + "341": 327, + "342": 328, + "343": 329, + "344": 330, + "345": 331, + "346": 332, + "347": 333, + "348": 334, + "349": 335, + "350": 336, + "351": 337, + "352": 338, + "353": 339, + "354": 340, + "355": 341, + "356": 342, + "357": 343, + "358": 344, + "359": 345, + "360": 346, + "361": 347, + "362": 348, + "363": 349, + "364": 350, + "365": 351, + "366": 352, + "367": 353, + "368": 354, + "369": 355, + "370": 356, + "371": 357, + "372": 358, + "373": 359, + "374": 359, + "375": 360, + "376": 361, + "377": 362, + "378": 363, + "379": 364, + "380": 365, + "381": 365, + "382": 366, + "383": 367, + "384": 368, + "385": 369, + "386": 370, + "387": 371, + "388": 372, + "389": 373, + "390": 374, + "391": 375, + "392": 376, + "393": 377, + "394": 378, + "395": 379, + "396": 380, + "397": 381, + "398": 382, + "399": 383, + "400": 384, + "401": 385, + "402": 386, + "403": 387, + "404": 388, + "405": 389, + "406": 390, + "407": 391, + "408": 392, + "409": 393, + "410": 394, + "411": 395, + "412": 395, + "413": 396, + "414": 397, + "415": 398, + "416": 399, + "417": 400, + "418": 400, + "419": 401, + "420": 401, + "421": 402, + "422": 403, + "423": 404, + "424": 405, + "425": 406, + "426": 407, + "427": 408, + "428": 409, + "429": 410, + "430": 411, + "431": 412, + "432": 413, + "433": 414, + "434": 415, + "435": 416, + "436": 417, + "437": 418, + "438": 419, + "439": 420, + "440": 421, + "441": 422, + "442": 423, + "443": 424, + "444": 425, + "445": 426, + "446": 427, + "447": 428, + "448": 429, + "449": 430, + "450": 431, + "451": 432, + "452": 433, + "453": 434, + "454": 435, + "455": 436, + "456": 437, + "457": 438, + "458": 439, + "459": 440, + "460": 441, + "461": 441, + "462": 442, + "463": 442, + "464": 443, + "465": 444, + "466": 445, + "467": 446, + "468": 447, + "469": 448, + "470": 449, + "471": 450, + "472": 451, + "473": 452, + "474": 453, + "475": 454, + "476": 455, + "477": 456, + "478": 457, + "479": 458, + "480": 459, + "481": 460, + "482": 461, + "483": 462, + "484": 463, + "485": 464, + "486": 465, + "487": 465, + "488": 466, + "489": 467, + "490": 467, + "491": 468, + "492": 468, + "493": 469, + "494": 470, + "495": 471, + "496": 471, + "497": 472, + "498": 473, + "499": 474, + "500": 475, + "501": 476, + "502": 477, + "503": 478, + "504": 479, + "505": 480, + "506": 481, + "507": 482, + "508": 483, + "509": 484, + "510": 485, + "511": 486, + "512": 486, + "513": 487, + "514": 488, + "515": 489, + "516": 490, + "517": 491, + "518": 492, + "519": 493, + "520": 494, + "521": 495, + "522": 496, + "523": 497, + "524": 
498, + "525": 499, + "526": 500, + "527": 501, + "528": 502, + "529": 503, + "530": 504, + "531": 505, + "532": 506, + "533": 507, + "534": 508, + "535": 509, + "536": 510, + "537": 511, + "538": 512, + "539": 513, + "540": 514, + "541": 515, + "542": 516, + "543": 517, + "544": 518, + "545": 519, + "546": 520, + "547": 521, + "548": 522, + "549": 523, + "550": 524, + "551": 525, + "552": 526, + "553": 527, + "554": 528, + "555": 529, + "556": 530, + "557": 531, + "558": 532, + "559": 532, + "560": 533, + "561": 533, + "562": 534, + "563": 535, + "564": 536, + "565": 537, + "566": 538, + "567": 539, + "568": 540, + "569": 541, + "570": 542, + "571": 543, + "572": 544, + "573": 545, + "574": 546, + "575": 547, + "576": 548, + "577": 549, + "578": 550, + "579": 551, + "580": 552, + "581": 553, + "582": 554, + "583": 555, + "584": 556, + "585": 557, + "586": 558, + "587": 559, + "588": 560, + "589": 561, + "590": 562, + "591": 563, + "592": 564, + "593": 565, + "594": 566, + "595": 567, + "596": 568, + "597": 569, + "598": 570, + "599": 571, + "600": 572, + "601": 573, + "602": 574, + "603": 575, + "604": 576, + "605": 577, + "606": 578, + "607": 579, + "608": 580, + "609": 581, + "610": 582, + "611": 583, + "612": 584, + "613": 585, + "614": 586, + "615": 587, + "616": 588, + "617": 589, + "618": 589, + "619": 590, + "620": 590, + "621": 591, + "622": 592, + "623": 593, + "624": 594, + "625": 595, + "626": 596, + "627": 597, + "628": 598, + "629": 599, + "630": 600, + "631": 601, + "632": 602, + "633": 603, + "634": 604, + "635": 605, + "636": 606, + "637": 607, + "638": 608, + "639": 608, + "640": 609, + "641": 610, + "642": 611, + "643": 611, + "644": 612, + "645": 612, + "646": 613, + "647": 613, + "648": 614, + "649": 614, + "650": 615, + "651": 615, + "652": 616, + "653": 616, + "654": 617, + "655": 618, + "656": 619, + "657": 620, + "658": 621, + "659": 622, + "660": 623, + "661": 624, + "662": 625, + "663": 626, + "664": 627, + "665": 628, + "666": 629, + "667": 630, + "668": 631, + "669": 632, + "670": 632, + "671": 633, + "672": 634, + "673": 635, + "674": 636, + "675": 637, + "676": 638, + "677": 639, + "678": 640, + "679": 641, + "680": 642, + "681": 643, + "682": 644, + "683": 645, + "684": 646, + "685": 647, + "686": 648, + "687": 649, + "688": 650, + "689": 651, + "690": 652, + "691": 653, + "692": 654, + "693": 655, + "694": 656, + "695": 657, + "696": 658, + "697": 659, + "698": 660, + "699": 661, + "700": 662, + "701": 663, + "702": 663, + "703": 664, + "704": 665, + "705": 665, + "706": 666, + "707": 666, + "708": 667, + "709": 668, + "710": 669, + "711": 670, + "712": 671, + "713": 672, + "714": 673, + "715": 674, + "716": 675, + "717": 676, + "718": 677, + "719": 678, + "720": 679, + "721": 680, + "722": 681, + "723": 682, + "724": 683, + "725": 684, + "726": 685, + "727": 686, + "728": 687, + "729": 688, + "730": 689, + "731": 690, + "732": 690, + "733": 691, + "734": 691, + "735": 692, + "736": 693, + "737": 694, + "738": 694, + "739": 695, + "740": 696, + "741": 697, + "742": 698, + "743": 699, + "744": 700, + "745": 701, + "746": 702, + "747": 703, + "748": 704, + "749": 705, + "750": 706, + "751": 707, + "752": 708, + "753": 709, + "754": 710, + "755": 711, + "756": 712, + "757": 713, + "758": 714, + "759": 715, + "760": 716, + "761": 717, + "762": 718, + "763": 719, + "764": 720, + "765": 721, + "766": 722, + "767": 723, + "768": 724, + "769": 725, + "770": 726, + "771": 727, + "772": 728, + "773": 729, + "774": 730, + "775": 731, + "776": 732, + "777": 733, + 
"778": 734, + "779": 734, + "780": 735, + "781": 735, + "782": 736, + "783": 737, + "784": 738, + "785": 739, + "786": 740, + "787": 741, + "788": 742, + "789": 743, + "790": 744, + "791": 745, + "792": 746, + "793": 747, + "794": 748, + "795": 749, + "796": 750, + "797": 751, + "798": 752, + "799": 753, + "800": 754, + "801": 755, + "802": 756, + "803": 757, + "804": 758, + "805": 759, + "806": 760, + "807": 761, + "808": 762, + "809": 763, + "810": 764, + "811": 765, + "812": 766, + "813": 767, + "814": 768, + "815": 769, + "816": 770, + "817": 771, + "818": 772, + "819": 773, + "820": 774, + "821": 775, + "822": 776, + "823": 777, + "824": 778, + "825": 779, + "826": 780, + "827": 781, + "828": 782, + "829": 783, + "830": 784, + "831": 785, + "832": 786, + "833": 787, + "834": 788, + "835": 789, + "836": 790, + "837": 791, + "838": 792, + "839": 792, + "840": 793, + "841": 794, + "842": 795, + "843": 796, + "844": 797, + "845": 798, + "846": 799, + "847": 800, + "848": 801, + "849": 802, + "850": 803, + "851": 804, + "852": 805, + "853": 806, + "854": 807, + "855": 808, + "856": 809, + "857": 810, + "858": 811, + "859": 812, + "860": 813, + "861": 814, + "862": 815, + "863": 816, + "864": 817, + "865": 818, + "866": 819, + "867": 819, + "868": 820, + "869": 820, + "870": 821, + "871": 822, + "872": 823, + "873": 824, + "874": 825, + "875": 826, + "876": 827, + "877": 828, + "878": 829, + "879": 830, + "880": 831, + "881": 832, + "882": 833, + "883": 834, + "884": 835, + "885": 836, + "886": 837, + "887": 838, + "888": 839, + "889": 840, + "890": 841, + "891": 842, + "892": 843, + "893": 844, + "894": 845, + "895": 846, + "896": 847, + "897": 848, + "898": 849, + "899": 850, + "900": 851, + "901": 852, + "902": 853, + "903": 854, + "904": 855, + "905": 856, + "906": 857, + "907": 858, + "908": 859, + "909": 860, + "910": 861, + "911": 862, + "912": 863, + "913": 864, + "914": 865, + "915": 866, + "916": 867, + "917": 868, + "918": 869, + "919": 870, + "920": 871, + "921": 872, + "922": 873, + "923": 874, + "924": 875, + "925": 876, + "926": 877, + "927": 878, + "928": 879, + "929": 880, + "930": 881, + "931": 882, + "932": 883, + "933": 884, + "934": 885, + "935": 886, + "936": 887, + "937": 888, + "938": 889, + "939": 890, + "940": 891, + "941": 892, + "942": 893, + "943": 894, + "944": 895, + "945": 896, + "946": 897, + "947": 898, + "948": 899, + "949": 900, + "950": 901, + "951": 902, + "952": 903, + "953": 904, + "954": 905, + "955": 906, + "956": 907, + "957": 908, + "958": 908, + "959": 909, + "entities": { + "T1": [ + 0, + 33 + ], + "T2": [ + 69, + 77 + ], + "T3": [ + 92, + 95 + ], + "T4": [ + 97, + 110 + ], + "T5": [ + 112, + 116 + ], + "T6": [ + 135, + 140 + ], + "T7": [ + 180, + 184 + ], + "T8": [ + 271, + 279 + ], + "T9": [ + 307, + 315 + ], + "T10": [ + 334, + 349 + ], + "T11": [ + 406, + 410 + ], + "T12": [ + 439, + 463 + ], + "T13": [ + 465, + 471 + ], + "T14": [ + 561, + 569 + ], + "T15": [ + 585, + 589 + ], + "T16": [ + 608, + 632 + ], + "T17": [ + 688, + 694 + ], + "T18": [ + 732, + 736 + ], + "T19": [ + 780, + 785 + ], + "T20": [ + 817, + 821 + ], + "T21": [ + 894, + 902 + ], + "T22": [ + 34, + 42 + ], + "T23": [ + 78, + 84 + ], + "T24": [ + 118, + 127 + ], + "T25": [ + 144, + 154 + ], + "T26": [ + 236, + 254 + ], + "T27": [ + 280, + 285 + ], + "T28": [ + 384, + 393 + ], + "T29": [ + 411, + 419 + ], + "T30": [ + 570, + 578 + ], + "T31": [ + 671, + 680 + ], + "T32": [ + 742, + 750 + ], + "T33": [ + 786, + 792 + ], + "T34": [ + 903, + 908 + ] + } +} \ No newline at end 
of file diff --git a/data/corpora/cg/debug/PMID-1388088.map b/data/corpora/cg/debug/PMID-1388088.map new file mode 100644 index 0000000..3c4e4ae --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.map @@ -0,0 +1,912 @@ +{ + "0": 0, + "1": 1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + "10": 10, + "11": 12, + "12": 14, + "13": 15, + "14": 16, + "15": 17, + "16": 18, + "17": 19, + "18": 20, + "19": 21, + "20": 22, + "21": 23, + "22": 24, + "23": 25, + "24": 26, + "25": 27, + "26": 28, + "27": 29, + "28": 30, + "29": 31, + "30": 32, + "31": 33, + "32": 34, + "33": 35, + "34": 36, + "35": 37, + "36": 38, + "37": 39, + "38": 40, + "39": 41, + "40": 42, + "41": 43, + "42": 44, + "43": 45, + "44": 46, + "45": 47, + "46": 48, + "47": 49, + "48": 50, + "49": 51, + "50": 52, + "51": 53, + "52": 54, + "53": 55, + "54": 56, + "55": 57, + "56": 58, + "57": 59, + "58": 60, + "59": 61, + "60": 62, + "61": 63, + "62": 64, + "63": 65, + "64": 66, + "65": 67, + "66": 68, + "67": 69, + "68": 70, + "69": 71, + "70": 72, + "71": 73, + "72": 74, + "73": 75, + "74": 76, + "75": 77, + "76": 78, + "77": 79, + "78": 80, + "79": 81, + "80": 82, + "81": 83, + "82": 84, + "83": 85, + "84": 86, + "85": 87, + "86": 88, + "87": 89, + "88": 90, + "89": 91, + "90": 92, + "91": 93, + "92": 94, + "93": 95, + "94": 96, + "95": 98, + "96": 99, + "97": 100, + "98": 101, + "99": 102, + "100": 103, + "101": 104, + "102": 105, + "103": 106, + "104": 107, + "105": 108, + "106": 109, + "107": 110, + "108": 112, + "109": 114, + "110": 115, + "111": 116, + "112": 118, + "113": 119, + "114": 121, + "115": 123, + "116": 125, + "117": 126, + "118": 127, + "119": 128, + "120": 129, + "121": 130, + "122": 131, + "123": 132, + "124": 133, + "125": 134, + "126": 135, + "127": 136, + "128": 137, + "129": 138, + "130": 139, + "131": 140, + "132": 141, + "133": 142, + "134": 143, + "135": 144, + "136": 145, + "137": 146, + "138": 147, + "139": 148, + "140": 149, + "141": 150, + "142": 151, + "143": 152, + "144": 153, + "145": 154, + "146": 155, + "147": 156, + "148": 157, + "149": 158, + "150": 159, + "151": 160, + "152": 161, + "153": 162, + "154": 163, + "155": 164, + "156": 165, + "157": 166, + "158": 167, + "159": 168, + "160": 169, + "161": 170, + "162": 171, + "163": 172, + "164": 173, + "165": 174, + "166": 175, + "167": 176, + "168": 177, + "169": 178, + "170": 179, + "171": 180, + "172": 181, + "173": 182, + "174": 183, + "175": 184, + "176": 185, + "177": 186, + "178": 187, + "179": 188, + "180": 189, + "181": 190, + "182": 192, + "183": 194, + "184": 195, + "185": 196, + "186": 197, + "187": 198, + "188": 199, + "189": 200, + "190": 201, + "191": 202, + "192": 203, + "193": 204, + "194": 205, + "195": 206, + "196": 207, + "197": 208, + "198": 209, + "199": 210, + "200": 211, + "201": 212, + "202": 213, + "203": 214, + "204": 215, + "205": 216, + "206": 217, + "207": 218, + "208": 219, + "209": 220, + "210": 221, + "211": 222, + "212": 223, + "213": 224, + "214": 226, + "215": 227, + "216": 228, + "217": 229, + "218": 230, + "219": 231, + "220": 232, + "221": 233, + "222": 234, + "223": 235, + "224": 236, + "225": 237, + "226": 238, + "227": 239, + "228": 240, + "229": 241, + "230": 242, + "231": 243, + "232": 244, + "233": 245, + "234": 246, + "235": 247, + "236": 248, + "237": 249, + "238": 250, + "239": 251, + "240": 252, + "241": 253, + "242": 254, + "243": 255, + "244": 256, + "245": 257, + "246": 258, + "247": 259, + "248": 260, + "249": 261, + "250": 262, + "251": 263, + "252": 264, + 
"253": 265, + "254": 267, + "255": 268, + "256": 269, + "257": 270, + "258": 271, + "259": 272, + "260": 273, + "261": 274, + "262": 275, + "263": 276, + "264": 277, + "265": 278, + "266": 279, + "267": 280, + "268": 281, + "269": 282, + "270": 283, + "271": 284, + "272": 285, + "273": 286, + "274": 287, + "275": 288, + "276": 289, + "277": 290, + "278": 291, + "279": 292, + "280": 293, + "281": 294, + "282": 295, + "283": 296, + "284": 297, + "285": 298, + "286": 299, + "287": 300, + "288": 301, + "289": 302, + "290": 303, + "291": 304, + "292": 305, + "293": 306, + "294": 307, + "295": 308, + "296": 309, + "297": 310, + "298": 311, + "299": 312, + "300": 313, + "301": 314, + "302": 315, + "303": 316, + "304": 317, + "305": 318, + "306": 319, + "307": 320, + "308": 321, + "309": 322, + "310": 323, + "311": 324, + "312": 325, + "313": 326, + "314": 327, + "315": 328, + "316": 329, + "317": 330, + "318": 331, + "319": 332, + "320": 333, + "321": 334, + "322": 335, + "323": 336, + "324": 337, + "325": 338, + "326": 339, + "327": 341, + "328": 342, + "329": 343, + "330": 344, + "331": 345, + "332": 346, + "333": 347, + "334": 348, + "335": 349, + "336": 350, + "337": 351, + "338": 352, + "339": 353, + "340": 354, + "341": 355, + "342": 356, + "343": 357, + "344": 358, + "345": 359, + "346": 360, + "347": 361, + "348": 362, + "349": 363, + "350": 364, + "351": 365, + "352": 366, + "353": 367, + "354": 368, + "355": 369, + "356": 370, + "357": 371, + "358": 372, + "359": 374, + "360": 375, + "361": 376, + "362": 377, + "363": 378, + "364": 379, + "365": 381, + "366": 382, + "367": 383, + "368": 384, + "369": 385, + "370": 386, + "371": 387, + "372": 388, + "373": 389, + "374": 390, + "375": 391, + "376": 392, + "377": 393, + "378": 394, + "379": 395, + "380": 396, + "381": 397, + "382": 398, + "383": 399, + "384": 400, + "385": 401, + "386": 402, + "387": 403, + "388": 404, + "389": 405, + "390": 406, + "391": 407, + "392": 408, + "393": 409, + "394": 410, + "395": 412, + "396": 413, + "397": 414, + "398": 415, + "399": 416, + "400": 418, + "401": 420, + "402": 421, + "403": 422, + "404": 423, + "405": 424, + "406": 425, + "407": 426, + "408": 427, + "409": 428, + "410": 429, + "411": 430, + "412": 431, + "413": 432, + "414": 433, + "415": 434, + "416": 435, + "417": 436, + "418": 437, + "419": 438, + "420": 439, + "421": 440, + "422": 441, + "423": 442, + "424": 443, + "425": 444, + "426": 445, + "427": 446, + "428": 447, + "429": 448, + "430": 449, + "431": 450, + "432": 451, + "433": 452, + "434": 453, + "435": 454, + "436": 455, + "437": 456, + "438": 457, + "439": 458, + "440": 459, + "441": 461, + "442": 463, + "443": 464, + "444": 465, + "445": 466, + "446": 467, + "447": 468, + "448": 469, + "449": 470, + "450": 471, + "451": 472, + "452": 473, + "453": 474, + "454": 475, + "455": 476, + "456": 477, + "457": 478, + "458": 479, + "459": 480, + "460": 481, + "461": 482, + "462": 483, + "463": 484, + "464": 485, + "465": 487, + "466": 488, + "467": 490, + "468": 492, + "469": 493, + "470": 494, + "471": 496, + "472": 497, + "473": 498, + "474": 499, + "475": 500, + "476": 501, + "477": 502, + "478": 503, + "479": 504, + "480": 505, + "481": 506, + "482": 507, + "483": 508, + "484": 509, + "485": 510, + "486": 512, + "487": 513, + "488": 514, + "489": 515, + "490": 516, + "491": 517, + "492": 518, + "493": 519, + "494": 520, + "495": 521, + "496": 522, + "497": 523, + "498": 524, + "499": 525, + "500": 526, + "501": 527, + "502": 528, + "503": 529, + "504": 530, + "505": 531, + "506": 532, 
+ "507": 533, + "508": 534, + "509": 535, + "510": 536, + "511": 537, + "512": 538, + "513": 539, + "514": 540, + "515": 541, + "516": 542, + "517": 543, + "518": 544, + "519": 545, + "520": 546, + "521": 547, + "522": 548, + "523": 549, + "524": 550, + "525": 551, + "526": 552, + "527": 553, + "528": 554, + "529": 555, + "530": 556, + "531": 557, + "532": 559, + "533": 561, + "534": 562, + "535": 563, + "536": 564, + "537": 565, + "538": 566, + "539": 567, + "540": 568, + "541": 569, + "542": 570, + "543": 571, + "544": 572, + "545": 573, + "546": 574, + "547": 575, + "548": 576, + "549": 577, + "550": 578, + "551": 579, + "552": 580, + "553": 581, + "554": 582, + "555": 583, + "556": 584, + "557": 585, + "558": 586, + "559": 587, + "560": 588, + "561": 589, + "562": 590, + "563": 591, + "564": 592, + "565": 593, + "566": 594, + "567": 595, + "568": 596, + "569": 597, + "570": 598, + "571": 599, + "572": 600, + "573": 601, + "574": 602, + "575": 603, + "576": 604, + "577": 605, + "578": 606, + "579": 607, + "580": 608, + "581": 609, + "582": 610, + "583": 611, + "584": 612, + "585": 613, + "586": 614, + "587": 615, + "588": 616, + "589": 618, + "590": 620, + "591": 621, + "592": 622, + "593": 623, + "594": 624, + "595": 625, + "596": 626, + "597": 627, + "598": 628, + "599": 629, + "600": 630, + "601": 631, + "602": 632, + "603": 633, + "604": 634, + "605": 635, + "606": 636, + "607": 637, + "608": 639, + "609": 640, + "610": 641, + "611": 643, + "612": 645, + "613": 647, + "614": 649, + "615": 651, + "616": 653, + "617": 654, + "618": 655, + "619": 656, + "620": 657, + "621": 658, + "622": 659, + "623": 660, + "624": 661, + "625": 662, + "626": 663, + "627": 664, + "628": 665, + "629": 666, + "630": 667, + "631": 668, + "632": 670, + "633": 671, + "634": 672, + "635": 673, + "636": 674, + "637": 675, + "638": 676, + "639": 677, + "640": 678, + "641": 679, + "642": 680, + "643": 681, + "644": 682, + "645": 683, + "646": 684, + "647": 685, + "648": 686, + "649": 687, + "650": 688, + "651": 689, + "652": 690, + "653": 691, + "654": 692, + "655": 693, + "656": 694, + "657": 695, + "658": 696, + "659": 697, + "660": 698, + "661": 699, + "662": 700, + "663": 702, + "664": 703, + "665": 705, + "666": 707, + "667": 708, + "668": 709, + "669": 710, + "670": 711, + "671": 712, + "672": 713, + "673": 714, + "674": 715, + "675": 716, + "676": 717, + "677": 718, + "678": 719, + "679": 720, + "680": 721, + "681": 722, + "682": 723, + "683": 724, + "684": 725, + "685": 726, + "686": 727, + "687": 728, + "688": 729, + "689": 730, + "690": 732, + "691": 734, + "692": 735, + "693": 736, + "694": 738, + "695": 739, + "696": 740, + "697": 741, + "698": 742, + "699": 743, + "700": 744, + "701": 745, + "702": 746, + "703": 747, + "704": 748, + "705": 749, + "706": 750, + "707": 751, + "708": 752, + "709": 753, + "710": 754, + "711": 755, + "712": 756, + "713": 757, + "714": 758, + "715": 759, + "716": 760, + "717": 761, + "718": 762, + "719": 763, + "720": 764, + "721": 765, + "722": 766, + "723": 767, + "724": 768, + "725": 769, + "726": 770, + "727": 771, + "728": 772, + "729": 773, + "730": 774, + "731": 775, + "732": 776, + "733": 777, + "734": 779, + "735": 781, + "736": 782, + "737": 783, + "738": 784, + "739": 785, + "740": 786, + "741": 787, + "742": 788, + "743": 789, + "744": 790, + "745": 791, + "746": 792, + "747": 793, + "748": 794, + "749": 795, + "750": 796, + "751": 797, + "752": 798, + "753": 799, + "754": 800, + "755": 801, + "756": 802, + "757": 803, + "758": 804, + "759": 805, + "760": 
806, + "761": 807, + "762": 808, + "763": 809, + "764": 810, + "765": 811, + "766": 812, + "767": 813, + "768": 814, + "769": 815, + "770": 816, + "771": 817, + "772": 818, + "773": 819, + "774": 820, + "775": 821, + "776": 822, + "777": 823, + "778": 824, + "779": 825, + "780": 826, + "781": 827, + "782": 828, + "783": 829, + "784": 830, + "785": 831, + "786": 832, + "787": 833, + "788": 834, + "789": 835, + "790": 836, + "791": 837, + "792": 839, + "793": 840, + "794": 841, + "795": 842, + "796": 843, + "797": 844, + "798": 845, + "799": 846, + "800": 847, + "801": 848, + "802": 849, + "803": 850, + "804": 851, + "805": 852, + "806": 853, + "807": 854, + "808": 855, + "809": 856, + "810": 857, + "811": 858, + "812": 859, + "813": 860, + "814": 861, + "815": 862, + "816": 863, + "817": 864, + "818": 865, + "819": 867, + "820": 869, + "821": 870, + "822": 871, + "823": 872, + "824": 873, + "825": 874, + "826": 875, + "827": 876, + "828": 877, + "829": 878, + "830": 879, + "831": 880, + "832": 881, + "833": 882, + "834": 883, + "835": 884, + "836": 885, + "837": 886, + "838": 887, + "839": 888, + "840": 889, + "841": 890, + "842": 891, + "843": 892, + "844": 893, + "845": 894, + "846": 895, + "847": 896, + "848": 897, + "849": 898, + "850": 899, + "851": 900, + "852": 901, + "853": 902, + "854": 903, + "855": 904, + "856": 905, + "857": 906, + "858": 907, + "859": 908, + "860": 909, + "861": 910, + "862": 911, + "863": 912, + "864": 913, + "865": 914, + "866": 915, + "867": 916, + "868": 917, + "869": 918, + "870": 919, + "871": 920, + "872": 921, + "873": 922, + "874": 923, + "875": 924, + "876": 925, + "877": 926, + "878": 927, + "879": 928, + "880": 929, + "881": 930, + "882": 931, + "883": 932, + "884": 933, + "885": 934, + "886": 935, + "887": 936, + "888": 937, + "889": 938, + "890": 939, + "891": 940, + "892": 941, + "893": 942, + "894": 943, + "895": 944, + "896": 945, + "897": 946, + "898": 947, + "899": 948, + "900": 949, + "901": 950, + "902": 951, + "903": 952, + "904": 953, + "905": 954, + "906": 955, + "907": 956, + "908": 958, + "909": 959 +} \ No newline at end of file diff --git a/data/corpora/cg/debug/PMID-1388088.txt b/data/corpora/cg/debug/PMID-1388088.txt new file mode 100644 index 0000000..784355b --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.txt @@ -0,0 +1,5 @@ +Interleukin - 1 receptor antagonist inhibits ischaemic and excitotoxic neuronal damage in the rat . +Interleukin - 1 ( IL - 1 ) synthesis in the brain is stimulated by mechanical injury and IL - 1 mimics some effects of injury , such as gliosis and neovascularization . +We report that neuronal death resulting from focal cerebral ischaemia ( middle cerebral artery occlusion , 24 h ) is significantly inhibited ( by 50 % ) in rats injected with a recombinant IL - 1 receptor antagonist ( IL - 1ra , 10 micrograms , icv 30 min before and 10 min after ischaemia ) . +Excitotoxic damage due to striatal infusion of an NMDA - receptor agonist ( cis - 2 , 4 - methanoglutamate ) was also markedly inhibited ( 71 % ) by injection of the IL - 1ra . +These data indicate that endogenous IL - 1 is a mediator of ischaemic and excitotoxic brain damage , and that inhibitors of IL - 1 action may be of therapeutic value in the treatment of acute or chronic neuronal death . 
\ No newline at end of file diff --git a/data/corpora/cg/debug/PMID-1388088.txt.ori b/data/corpora/cg/debug/PMID-1388088.txt.ori new file mode 100644 index 0000000..e19d3f7 --- /dev/null +++ b/data/corpora/cg/debug/PMID-1388088.txt.ori @@ -0,0 +1,2 @@ +Interleukin-1 receptor antagonist inhibits ischaemic and excitotoxic neuronal damage in the rat. +Interleukin-1 (IL-1) synthesis in the brain is stimulated by mechanical injury and IL-1 mimics some effects of injury, such as gliosis and neovascularization. We report that neuronal death resulting from focal cerebral ischaemia (middle cerebral artery occlusion, 24 h) is significantly inhibited (by 50%) in rats injected with a recombinant IL-1 receptor antagonist (IL-1ra, 10 micrograms, icv 30 min before and 10 min after ischaemia). Excitotoxic damage due to striatal infusion of an NMDA-receptor agonist (cis-2,4-methanoglutamate) was also markedly inhibited (71%) by injection of the IL-1ra. These data indicate that endogenous IL-1 is a mediator of ischaemic and excitotoxic brain damage, and that inhibitors of IL-1 action may be of therapeutic value in the treatment of acute or chronic neuronal death. diff --git a/data/corpora/cg/debug/PMID-198130.a1 b/data/corpora/cg/debug/PMID-198130.a1 new file mode 100644 index 0000000..c33ed74 --- /dev/null +++ b/data/corpora/cg/debug/PMID-198130.a1 @@ -0,0 +1,35 @@ +T1 Simple_chemical 13 73 malate - aspartate reduced nicotinamide adenine dinucleotide +T2 Cell 111 122 tumor cells +T3 Simple_chemical 155 158 CO2 +T4 Simple_chemical 163 170 lactate +T5 Simple_chemical 268 286 tricarboxylic acid +T6 Organism 320 326 rodent +T7 Cell 327 346 ascites tumor cells +T8 Cell 364 391 Ehrlich ascites tumor cells +T9 Cell 394 412 Krebs II carcinoma +T10 Cell 415 433 AS - 30D carcinoma +T11 Cell 440 451 L1210 cells +T12 Simple_chemical 513 532 D - [ 14C ] glucose +T13 Simple_chemical 612 645 nicotinamide adenine dinucleotide +T14 Simple_chemical 648 652 NADH +T15 Cell 672 677 cells +T16 Simple_chemical 737 743 malate +T17 Simple_chemical 746 755 aspartate +T18 Cell 910 920 cell lines +T19 Simple_chemical 982 1000 tricarboxylic acid +T20 Simple_chemical 1009 1017 electron +T21 Simple_chemical 1038 1044 malate +T22 Simple_chemical 1047 1056 aspartate +T23 Cancer 1074 1080 tumors +T24 Simple_chemical 1110 1114 NADH +T25 Cellular_component 1136 1149 mitochondrial +T26 Simple_chemical 1176 1180 NADH +T27 Simple_chemical 1278 1284 oxygen +T28 Cancer 1294 1300 tumors +T29 Simple_chemical 1345 1367 adenosine triphosphate +T30 Simple_chemical 1445 1467 adenosine triphosphate +T31 Simple_chemical 1484 1492 electron +T32 Cellular_component 1515 1524 cytosolic +T33 Simple_chemical 1525 1529 NADH +T34 Simple_chemical 1538 1544 malate +T35 Simple_chemical 1547 1556 aspartate diff --git a/data/corpora/cg/debug/PMID-198130.a2 b/data/corpora/cg/debug/PMID-198130.a2 new file mode 100644 index 0000000..c3ebf95 --- /dev/null +++ b/data/corpora/cg/debug/PMID-198130.a2 @@ -0,0 +1,39 @@ +* Equiv T13 T14 +T36 Pathway 74 81 shuttle +T37 Synthesis 171 181 production +T38 Glycolysis 244 263 glycolytic sequence +T39 Pathway 287 292 cycle +T40 Planned_process 454 463 incubated +T41 Negative_regulation 604 611 reduced +T42 Pathway 655 662 shuttle +T43 Positive_regulation 678 686 requires +T44 Pathway 756 763 shuttle +T45 Glycolysis 965 975 glycolysis +T46 Pathway 1001 1006 cycle +T47 Localization 1018 1027 transport +T48 Pathway 1057 1064 shuttle +T49 Pathway 1181 1188 shuttle +T50 Synthesis 1368 1377 synthesis +T51 Synthesis 1471 1480 generated +T52 
Pathway 1557 1564 shuttle +E1 Pathway:T36 Participant:T1 +E2 Synthesis:T37 Theme:T3 +E3 Synthesis:T37 Theme:T4 +E4 Glycolysis:T38 +E5 Pathway:T39 Theme:T5 +E6 Planned_process:T40 Instrument:T12 Theme:T8 +E7 Planned_process:T40 Instrument:T12 Theme:T9 +E8 Planned_process:T40 Instrument:T12 Theme:T10 +E9 Planned_process:T40 Instrument:T12 Theme:T11 +E10 Negative_regulation:T41 Theme:E11 +E11 Pathway:T42 Participant:T13 +E12 Positive_regulation:T43 Theme:E11 Cause:E13 +E13 Pathway:T44 Participant:T16 Participant2:T17 +E14 Glycolysis:T45 +E15 Pathway:T46 Participant:T19 +E16 Localization:T47 Theme:T20 +E17 Pathway:T48 Participant:T21 Participant2:T22 +E18 Pathway:T49 Participant:T26 +E19 Synthesis:T50 Theme:T29 +E20 Synthesis:T51 Theme:T30 +E21 Pathway:T52 Participant:T34 Participant2:T35 diff --git a/data/corpora/cg/debug/PMID-198130.ann b/data/corpora/cg/debug/PMID-198130.ann new file mode 100644 index 0000000..dff9359 --- /dev/null +++ b/data/corpora/cg/debug/PMID-198130.ann @@ -0,0 +1,98 @@ +* Equiv T13 T14 +T1 Simple_chemical 13 73 malate - aspartate reduced nicotinamide adenine dinucleotide +T2 Cell 111 122 tumor cells +T3 Simple_chemical 155 158 CO2 +T4 Simple_chemical 163 170 lactate +T5 Simple_chemical 268 286 tricarboxylic acid +T6 Organism 320 326 rodent +T7 Cell 327 346 ascites tumor cells +T8 Cell 364 391 Ehrlich ascites tumor cells +T9 Cell 394 412 Krebs II carcinoma +T10 Cell 415 433 AS - 30D carcinoma +T11 Cell 440 451 L1210 cells +T12 Simple_chemical 513 532 D - [ 14C ] glucose +T13 Simple_chemical 612 645 nicotinamide adenine dinucleotide +T14 Simple_chemical 648 652 NADH +T15 Cell 672 677 cells +T16 Simple_chemical 737 743 malate +T17 Simple_chemical 746 755 aspartate +T18 Cell 910 920 cell lines +T19 Simple_chemical 982 1000 tricarboxylic acid +T20 Simple_chemical 1009 1017 electron +T21 Simple_chemical 1038 1044 malate +T22 Simple_chemical 1047 1056 aspartate +T23 Cancer 1074 1080 tumors +T24 Simple_chemical 1110 1114 NADH +T25 Cellular_component 1136 1149 mitochondrial +T26 Simple_chemical 1176 1180 NADH +T27 Simple_chemical 1278 1284 oxygen +T28 Cancer 1294 1300 tumors +T29 Simple_chemical 1345 1367 adenosine triphosphate +T30 Simple_chemical 1445 1467 adenosine triphosphate +T31 Simple_chemical 1484 1492 electron +T32 Cellular_component 1515 1524 cytosolic +T33 Simple_chemical 1525 1529 NADH +T34 Simple_chemical 1538 1544 malate +T35 Simple_chemical 1547 1556 aspartate +TR36 Pathway 74 81 shuttle +TR37 Synthesis 171 181 production +TR38 Glycolysis 244 263 glycolytic sequence +TR39 Pathway 287 292 cycle +TR40 Planned_process 454 463 incubated +TR41 Negative_regulation 604 611 reduced +TR42 Pathway 655 662 shuttle +TR43 Positive_regulation 678 686 requires +TR44 Pathway 756 763 shuttle +TR45 Glycolysis 965 975 glycolysis +TR46 Pathway 1001 1006 cycle +TR47 Localization 1018 1027 transport +TR48 Pathway 1057 1064 shuttle +TR49 Pathway 1181 1188 shuttle +TR50 Synthesis 1368 1377 synthesis +TR51 Synthesis 1471 1480 generated +TR52 Pathway 1557 1564 shuttle +R1 Participant Arg1:TR48 Arg2:T21 +R2 Theme Arg1:TR37 Arg2:T3 +R3 Theme Arg1:TR50 Arg2:T29 +R4 Participant Arg1:TR44 Arg2:T17 +R5 Theme Arg1:TR51 Arg2:T30 +R6 Theme Arg1:TR40 Arg2:T8 +R7 Participant Arg1:TR42 Arg2:T13 +R8 Participant Arg1:TR48 Arg2:T22 +R9 Participant Arg1:TR52 Arg2:T35 +R10 Theme Arg1:TR40 Arg2:T10 +R11 Cause Arg1:TR43 Arg2:TR44 +R12 Participant Arg1:TR44 Arg2:T16 +R13 Theme Arg1:TR43 Arg2:TR42 +R14 Instrument Arg1:TR40 Arg2:T12 +R15 Theme Arg1:TR40 Arg2:T11 +R16 Theme Arg1:TR47 Arg2:T20 +R17 Theme 
Arg1:TR39 Arg2:T5 +R18 Theme Arg1:TR40 Arg2:T9 +R19 Participant Arg1:TR36 Arg2:T1 +R20 Theme Arg1:TR37 Arg2:T4 +R21 Participant Arg1:TR52 Arg2:T34 +R22 Participant Arg1:TR49 Arg2:T26 +R23 Participant Arg1:TR46 Arg2:T19 +R24 Theme Arg1:TR41 Arg2:TR42 +E1 Pathway:TR36 Participant:T1 +E2 Synthesis:TR37 Theme:T3 +E3 Synthesis:TR37 Theme:T4 +E4 Glycolysis:TR38 +E5 Pathway:TR39 Theme:T5 +E6 Planned_process:TR40 Instrument:T12 Theme:T8 +E7 Planned_process:TR40 Instrument:T12 Theme:T9 +E8 Planned_process:TR40 Instrument:T12 Theme:T10 +E9 Planned_process:TR40 Instrument:T12 Theme:T11 +E10 Negative_regulation:TR41 Theme:E11 +E11 Pathway:TR42 Participant:T13 +E12 Positive_regulation:TR43 Theme:E11 Cause:E13 +E13 Pathway:TR44 Participant:T16 Participant2:T17 +E14 Glycolysis:TR45 +E15 Pathway:TR46 Participant:T19 +E16 Localization:TR47 Theme:T20 +E17 Pathway:TR48 Participant:T21 Participant2:T22 +E18 Pathway:TR49 Participant:T26 +E19 Synthesis:TR50 Theme:T29 +E20 Synthesis:TR51 Theme:T30 +E21 Pathway:TR52 Participant:T34 Participant2:T35 diff --git a/data/corpora/cg/debug/PMID-198130.inv.map b/data/corpora/cg/debug/PMID-198130.inv.map new file mode 100644 index 0000000..11bfef1 --- /dev/null +++ b/data/corpora/cg/debug/PMID-198130.inv.map @@ -0,0 +1,1779 @@ +{ + "0": 0, + "1": 1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + "10": 10, + "11": 11, + "12": 12, + "13": 13, + "14": 14, + "15": 15, + "16": 16, + "17": 17, + "18": 18, + "19": 19, + "20": 19, + "21": 20, + "22": 20, + "23": 21, + "24": 22, + "25": 23, + "26": 24, + "27": 25, + "28": 26, + "29": 27, + "30": 28, + "31": 29, + "32": 30, + "33": 31, + "34": 32, + "35": 33, + "36": 34, + "37": 35, + "38": 36, + "39": 37, + "40": 38, + "41": 39, + "42": 40, + "43": 41, + "44": 42, + "45": 43, + "46": 44, + "47": 45, + "48": 46, + "49": 47, + "50": 48, + "51": 49, + "52": 50, + "53": 51, + "54": 52, + "55": 53, + "56": 54, + "57": 55, + "58": 56, + "59": 57, + "60": 58, + "61": 59, + "62": 60, + "63": 61, + "64": 62, + "65": 63, + "66": 64, + "67": 65, + "68": 66, + "69": 67, + "70": 68, + "71": 69, + "72": 70, + "73": 71, + "74": 72, + "75": 73, + "76": 74, + "77": 75, + "78": 76, + "79": 77, + "80": 78, + "81": 79, + "82": 80, + "83": 81, + "84": 82, + "85": 83, + "86": 84, + "87": 85, + "88": 86, + "89": 87, + "90": 88, + "91": 89, + "92": 90, + "93": 91, + "94": 92, + "95": 93, + "96": 94, + "97": 95, + "98": 96, + "99": 97, + "100": 98, + "101": 99, + "102": 100, + "103": 101, + "104": 102, + "105": 103, + "106": 104, + "107": 105, + "108": 106, + "109": 107, + "110": 108, + "111": 109, + "112": 110, + "113": 111, + "114": 112, + "115": 113, + "116": 114, + "117": 115, + "118": 116, + "119": 117, + "120": 118, + "121": 119, + "122": 120, + "123": 120, + "124": 121, + "125": 122, + "126": 123, + "127": 124, + "128": 125, + "129": 126, + "130": 127, + "131": 128, + "132": 129, + "133": 130, + "134": 131, + "135": 132, + "136": 133, + "137": 134, + "138": 135, + "139": 136, + "140": 137, + "141": 138, + "142": 139, + "143": 140, + "144": 141, + "145": 142, + "146": 143, + "147": 144, + "148": 145, + "149": 146, + "150": 147, + "151": 148, + "152": 149, + "153": 149, + "154": 150, + "155": 151, + "156": 152, + "157": 153, + "158": 154, + "159": 155, + "160": 156, + "161": 157, + "162": 158, + "163": 159, + "164": 160, + "165": 161, + "166": 162, + "167": 163, + "168": 164, + "169": 165, + "170": 166, + "171": 167, + "172": 168, + "173": 169, + "174": 170, + "175": 171, + "176": 172, + "177": 173, + "178": 174, + 
"179": 175, + "180": 176, + "181": 177, + "182": 177, + "183": 178, + "184": 179, + "185": 180, + "186": 181, + "187": 182, + "188": 183, + "189": 184, + "190": 185, + "191": 186, + "192": 187, + "193": 188, + "194": 189, + "195": 190, + "196": 191, + "197": 192, + "198": 193, + "199": 194, + "200": 195, + "201": 196, + "202": 197, + "203": 198, + "204": 199, + "205": 200, + "206": 201, + "207": 202, + "208": 203, + "209": 204, + "210": 205, + "211": 206, + "212": 207, + "213": 208, + "214": 209, + "215": 210, + "216": 211, + "217": 212, + "218": 213, + "219": 214, + "220": 215, + "221": 216, + "222": 217, + "223": 218, + "224": 219, + "225": 220, + "226": 221, + "227": 222, + "228": 223, + "229": 224, + "230": 225, + "231": 226, + "232": 227, + "233": 228, + "234": 229, + "235": 230, + "236": 231, + "237": 232, + "238": 233, + "239": 234, + "240": 235, + "241": 236, + "242": 237, + "243": 238, + "244": 239, + "245": 240, + "246": 241, + "247": 242, + "248": 243, + "249": 244, + "250": 245, + "251": 246, + "252": 247, + "253": 248, + "254": 249, + "255": 250, + "256": 251, + "257": 252, + "258": 253, + "259": 254, + "260": 255, + "261": 256, + "262": 257, + "263": 258, + "264": 259, + "265": 260, + "266": 261, + "267": 262, + "268": 263, + "269": 264, + "270": 265, + "271": 266, + "272": 267, + "273": 268, + "274": 269, + "275": 270, + "276": 271, + "277": 272, + "278": 273, + "279": 274, + "280": 275, + "281": 276, + "282": 277, + "283": 278, + "284": 279, + "285": 280, + "286": 281, + "287": 282, + "288": 283, + "289": 284, + "290": 285, + "291": 286, + "292": 287, + "293": 288, + "294": 289, + "295": 290, + "296": 291, + "297": 292, + "298": 293, + "299": 294, + "300": 295, + "301": 296, + "302": 297, + "303": 298, + "304": 299, + "305": 300, + "306": 301, + "307": 302, + "308": 303, + "309": 304, + "310": 305, + "311": 306, + "312": 307, + "313": 308, + "314": 309, + "315": 310, + "316": 311, + "317": 312, + "318": 313, + "319": 314, + "320": 315, + "321": 316, + "322": 317, + "323": 318, + "324": 319, + "325": 320, + "326": 321, + "327": 322, + "328": 323, + "329": 324, + "330": 325, + "331": 326, + "332": 327, + "333": 328, + "334": 329, + "335": 330, + "336": 331, + "337": 332, + "338": 333, + "339": 334, + "340": 335, + "341": 336, + "342": 337, + "343": 338, + "344": 339, + "345": 340, + "346": 341, + "347": 342, + "348": 343, + "349": 343, + "350": 344, + "351": 345, + "352": 346, + "353": 347, + "354": 348, + "355": 349, + "356": 350, + "357": 351, + "358": 352, + "359": 353, + "360": 354, + "361": 355, + "362": 356, + "363": 357, + "364": 358, + "365": 359, + "366": 360, + "367": 361, + "368": 362, + "369": 363, + "370": 364, + "371": 365, + "372": 366, + "373": 367, + "374": 368, + "375": 369, + "376": 370, + "377": 371, + "378": 372, + "379": 373, + "380": 374, + "381": 375, + "382": 376, + "383": 377, + "384": 378, + "385": 379, + "386": 380, + "387": 381, + "388": 382, + "389": 383, + "390": 384, + "391": 385, + "392": 385, + "393": 386, + "394": 387, + "395": 388, + "396": 389, + "397": 390, + "398": 391, + "399": 392, + "400": 393, + "401": 394, + "402": 395, + "403": 396, + "404": 397, + "405": 398, + "406": 399, + "407": 400, + "408": 401, + "409": 402, + "410": 403, + "411": 404, + "412": 405, + "413": 405, + "414": 406, + "415": 407, + "416": 408, + "417": 409, + "418": 409, + "419": 410, + "420": 410, + "421": 411, + "422": 412, + "423": 413, + "424": 414, + "425": 415, + "426": 416, + "427": 417, + "428": 418, + "429": 419, + "430": 420, + "431": 421, + "432": 422, 
+ "433": 423, + "434": 423, + "435": 424, + "436": 425, + "437": 426, + "438": 427, + "439": 428, + "440": 429, + "441": 430, + "442": 431, + "443": 432, + "444": 433, + "445": 434, + "446": 435, + "447": 436, + "448": 437, + "449": 438, + "450": 439, + "451": 440, + "452": 440, + "453": 441, + "454": 442, + "455": 443, + "456": 444, + "457": 445, + "458": 446, + "459": 447, + "460": 448, + "461": 449, + "462": 450, + "463": 451, + "464": 452, + "465": 453, + "466": 454, + "467": 455, + "468": 456, + "469": 457, + "470": 458, + "471": 459, + "472": 460, + "473": 461, + "474": 462, + "475": 463, + "476": 464, + "477": 465, + "478": 466, + "479": 467, + "480": 468, + "481": 469, + "482": 470, + "483": 471, + "484": 472, + "485": 473, + "486": 474, + "487": 475, + "488": 476, + "489": 477, + "490": 478, + "491": 479, + "492": 480, + "493": 481, + "494": 482, + "495": 483, + "496": 484, + "497": 485, + "498": 486, + "499": 487, + "500": 488, + "501": 489, + "502": 490, + "503": 491, + "504": 492, + "505": 493, + "506": 494, + "507": 495, + "508": 496, + "509": 497, + "510": 498, + "511": 499, + "512": 500, + "513": 501, + "514": 502, + "515": 502, + "516": 503, + "517": 503, + "518": 504, + "519": 504, + "520": 505, + "521": 506, + "522": 507, + "523": 507, + "524": 508, + "525": 508, + "526": 509, + "527": 510, + "528": 511, + "529": 512, + "530": 513, + "531": 514, + "532": 515, + "533": 515, + "534": 516, + "535": 517, + "536": 518, + "537": 519, + "538": 520, + "539": 521, + "540": 522, + "541": 523, + "542": 524, + "543": 525, + "544": 526, + "545": 527, + "546": 528, + "547": 529, + "548": 530, + "549": 531, + "550": 532, + "551": 532, + "552": 533, + "553": 534, + "554": 535, + "555": 536, + "556": 537, + "557": 538, + "558": 539, + "559": 540, + "560": 541, + "561": 542, + "562": 543, + "563": 544, + "564": 545, + "565": 546, + "566": 547, + "567": 548, + "568": 549, + "569": 550, + "570": 551, + "571": 552, + "572": 553, + "573": 554, + "574": 555, + "575": 556, + "576": 557, + "577": 558, + "578": 559, + "579": 560, + "580": 561, + "581": 562, + "582": 563, + "583": 564, + "584": 565, + "585": 566, + "586": 567, + "587": 568, + "588": 569, + "589": 570, + "590": 571, + "591": 572, + "592": 573, + "593": 574, + "594": 575, + "595": 576, + "596": 577, + "597": 578, + "598": 579, + "599": 580, + "600": 581, + "601": 582, + "602": 583, + "603": 584, + "604": 585, + "605": 586, + "606": 587, + "607": 588, + "608": 589, + "609": 590, + "610": 591, + "611": 592, + "612": 593, + "613": 594, + "614": 595, + "615": 596, + "616": 597, + "617": 598, + "618": 599, + "619": 600, + "620": 601, + "621": 602, + "622": 603, + "623": 604, + "624": 605, + "625": 606, + "626": 607, + "627": 608, + "628": 609, + "629": 610, + "630": 611, + "631": 612, + "632": 613, + "633": 614, + "634": 615, + "635": 616, + "636": 617, + "637": 618, + "638": 619, + "639": 620, + "640": 621, + "641": 622, + "642": 623, + "643": 624, + "644": 625, + "645": 626, + "646": 627, + "647": 628, + "648": 628, + "649": 629, + "650": 630, + "651": 631, + "652": 632, + "653": 632, + "654": 633, + "655": 634, + "656": 635, + "657": 636, + "658": 637, + "659": 638, + "660": 639, + "661": 640, + "662": 641, + "663": 642, + "664": 643, + "665": 644, + "666": 645, + "667": 646, + "668": 647, + "669": 648, + "670": 649, + "671": 650, + "672": 651, + "673": 652, + "674": 653, + "675": 654, + "676": 655, + "677": 656, + "678": 657, + "679": 658, + "680": 659, + "681": 660, + "682": 661, + "683": 662, + "684": 663, + "685": 664, + "686": 
665, + "687": 666, + "688": 667, + "689": 668, + "690": 669, + "691": 670, + "692": 671, + "693": 672, + "694": 673, + "695": 674, + "696": 675, + "697": 676, + "698": 677, + "699": 678, + "700": 679, + "701": 680, + "702": 681, + "703": 682, + "704": 683, + "705": 684, + "706": 685, + "707": 686, + "708": 687, + "709": 688, + "710": 689, + "711": 690, + "712": 691, + "713": 692, + "714": 693, + "715": 694, + "716": 695, + "717": 696, + "718": 697, + "719": 698, + "720": 699, + "721": 700, + "722": 701, + "723": 702, + "724": 703, + "725": 704, + "726": 705, + "727": 706, + "728": 707, + "729": 708, + "730": 709, + "731": 710, + "732": 711, + "733": 712, + "734": 713, + "735": 714, + "736": 715, + "737": 716, + "738": 717, + "739": 718, + "740": 719, + "741": 720, + "742": 721, + "743": 722, + "744": 722, + "745": 723, + "746": 723, + "747": 724, + "748": 725, + "749": 726, + "750": 727, + "751": 728, + "752": 729, + "753": 730, + "754": 731, + "755": 732, + "756": 733, + "757": 734, + "758": 735, + "759": 736, + "760": 737, + "761": 738, + "762": 739, + "763": 740, + "764": 741, + "765": 742, + "766": 742, + "767": 743, + "768": 743, + "769": 744, + "770": 744, + "771": 745, + "772": 745, + "773": 746, + "774": 746, + "775": 747, + "776": 747, + "777": 748, + "778": 749, + "779": 750, + "780": 751, + "781": 752, + "782": 753, + "783": 754, + "784": 755, + "785": 756, + "786": 757, + "787": 758, + "788": 759, + "789": 760, + "790": 761, + "791": 762, + "792": 763, + "793": 764, + "794": 765, + "795": 765, + "796": 766, + "797": 766, + "798": 767, + "799": 767, + "800": 768, + "801": 769, + "802": 770, + "803": 771, + "804": 772, + "805": 773, + "806": 774, + "807": 775, + "808": 776, + "809": 777, + "810": 778, + "811": 778, + "812": 779, + "813": 780, + "814": 781, + "815": 782, + "816": 783, + "817": 784, + "818": 785, + "819": 786, + "820": 787, + "821": 788, + "822": 789, + "823": 790, + "824": 790, + "825": 791, + "826": 791, + "827": 792, + "828": 793, + "829": 794, + "830": 795, + "831": 795, + "832": 796, + "833": 797, + "834": 798, + "835": 799, + "836": 800, + "837": 801, + "838": 801, + "839": 802, + "840": 802, + "841": 803, + "842": 804, + "843": 805, + "844": 806, + "845": 806, + "846": 807, + "847": 808, + "848": 809, + "849": 810, + "850": 811, + "851": 812, + "852": 812, + "853": 813, + "854": 813, + "855": 814, + "856": 815, + "857": 816, + "858": 817, + "859": 818, + "860": 819, + "861": 820, + "862": 821, + "863": 822, + "864": 823, + "865": 824, + "866": 825, + "867": 826, + "868": 827, + "869": 828, + "870": 829, + "871": 830, + "872": 831, + "873": 832, + "874": 833, + "875": 834, + "876": 835, + "877": 836, + "878": 837, + "879": 838, + "880": 839, + "881": 840, + "882": 841, + "883": 842, + "884": 843, + "885": 844, + "886": 845, + "887": 846, + "888": 847, + "889": 848, + "890": 849, + "891": 850, + "892": 851, + "893": 852, + "894": 853, + "895": 854, + "896": 855, + "897": 856, + "898": 857, + "899": 858, + "900": 859, + "901": 860, + "902": 861, + "903": 862, + "904": 863, + "905": 864, + "906": 865, + "907": 866, + "908": 867, + "909": 868, + "910": 869, + "911": 870, + "912": 871, + "913": 872, + "914": 873, + "915": 874, + "916": 875, + "917": 876, + "918": 877, + "919": 878, + "920": 879, + "921": 879, + "922": 880, + "923": 881, + "924": 882, + "925": 883, + "926": 884, + "927": 885, + "928": 886, + "929": 887, + "930": 888, + "931": 889, + "932": 890, + "933": 891, + "934": 892, + "935": 893, + "936": 894, + "937": 895, + "938": 896, + "939": 897, + 
"940": 898, + "941": 899, + "942": 900, + "943": 901, + "944": 902, + "945": 903, + "946": 904, + "947": 905, + "948": 906, + "949": 907, + "950": 908, + "951": 909, + "952": 910, + "953": 911, + "954": 912, + "955": 913, + "956": 914, + "957": 915, + "958": 916, + "959": 917, + "960": 918, + "961": 919, + "962": 920, + "963": 921, + "964": 922, + "965": 923, + "966": 924, + "967": 925, + "968": 926, + "969": 927, + "970": 928, + "971": 929, + "972": 930, + "973": 931, + "974": 932, + "975": 933, + "976": 933, + "977": 934, + "978": 935, + "979": 936, + "980": 937, + "981": 938, + "982": 939, + "983": 940, + "984": 941, + "985": 942, + "986": 943, + "987": 944, + "988": 945, + "989": 946, + "990": 947, + "991": 948, + "992": 949, + "993": 950, + "994": 951, + "995": 952, + "996": 953, + "997": 954, + "998": 955, + "999": 956, + "1000": 957, + "1001": 958, + "1002": 959, + "1003": 960, + "1004": 961, + "1005": 962, + "1006": 963, + "1007": 963, + "1008": 964, + "1009": 965, + "1010": 966, + "1011": 967, + "1012": 968, + "1013": 969, + "1014": 970, + "1015": 971, + "1016": 972, + "1017": 973, + "1018": 974, + "1019": 975, + "1020": 976, + "1021": 977, + "1022": 978, + "1023": 979, + "1024": 980, + "1025": 981, + "1026": 982, + "1027": 983, + "1028": 983, + "1029": 984, + "1030": 985, + "1031": 986, + "1032": 987, + "1033": 988, + "1034": 989, + "1035": 990, + "1036": 991, + "1037": 992, + "1038": 993, + "1039": 994, + "1040": 995, + "1041": 996, + "1042": 997, + "1043": 998, + "1044": 999, + "1045": 999, + "1046": 1000, + "1047": 1000, + "1048": 1001, + "1049": 1002, + "1050": 1003, + "1051": 1004, + "1052": 1005, + "1053": 1006, + "1054": 1007, + "1055": 1008, + "1056": 1009, + "1057": 1010, + "1058": 1011, + "1059": 1012, + "1060": 1013, + "1061": 1014, + "1062": 1015, + "1063": 1016, + "1064": 1017, + "1065": 1018, + "1066": 1019, + "1067": 1020, + "1068": 1021, + "1069": 1022, + "1070": 1023, + "1071": 1024, + "1072": 1025, + "1073": 1026, + "1074": 1027, + "1075": 1028, + "1076": 1029, + "1077": 1030, + "1078": 1031, + "1079": 1032, + "1080": 1033, + "1081": 1033, + "1082": 1034, + "1083": 1035, + "1084": 1036, + "1085": 1037, + "1086": 1038, + "1087": 1039, + "1088": 1040, + "1089": 1041, + "1090": 1042, + "1091": 1043, + "1092": 1044, + "1093": 1045, + "1094": 1046, + "1095": 1047, + "1096": 1048, + "1097": 1049, + "1098": 1050, + "1099": 1051, + "1100": 1052, + "1101": 1053, + "1102": 1054, + "1103": 1055, + "1104": 1056, + "1105": 1057, + "1106": 1058, + "1107": 1059, + "1108": 1060, + "1109": 1061, + "1110": 1062, + "1111": 1063, + "1112": 1064, + "1113": 1065, + "1114": 1066, + "1115": 1067, + "1116": 1068, + "1117": 1069, + "1118": 1070, + "1119": 1071, + "1120": 1072, + "1121": 1073, + "1122": 1074, + "1123": 1075, + "1124": 1076, + "1125": 1077, + "1126": 1078, + "1127": 1079, + "1128": 1080, + "1129": 1081, + "1130": 1082, + "1131": 1083, + "1132": 1084, + "1133": 1085, + "1134": 1086, + "1135": 1087, + "1136": 1088, + "1137": 1089, + "1138": 1090, + "1139": 1091, + "1140": 1092, + "1141": 1093, + "1142": 1094, + "1143": 1095, + "1144": 1096, + "1145": 1097, + "1146": 1098, + "1147": 1099, + "1148": 1100, + "1149": 1101, + "1150": 1102, + "1151": 1103, + "1152": 1104, + "1153": 1105, + "1154": 1106, + "1155": 1107, + "1156": 1108, + "1157": 1109, + "1158": 1110, + "1159": 1111, + "1160": 1112, + "1161": 1113, + "1162": 1114, + "1163": 1115, + "1164": 1116, + "1165": 1117, + "1166": 1118, + "1167": 1119, + "1168": 1120, + "1169": 1121, + "1170": 1122, + "1171": 1123, + "1172": 
1124, + "1173": 1125, + "1174": 1126, + "1175": 1127, + "1176": 1128, + "1177": 1129, + "1178": 1130, + "1179": 1131, + "1180": 1132, + "1181": 1133, + "1182": 1134, + "1183": 1135, + "1184": 1136, + "1185": 1137, + "1186": 1138, + "1187": 1139, + "1188": 1140, + "1189": 1140, + "1190": 1141, + "1191": 1142, + "1192": 1143, + "1193": 1144, + "1194": 1145, + "1195": 1146, + "1196": 1147, + "1197": 1148, + "1198": 1149, + "1199": 1150, + "1200": 1151, + "1201": 1152, + "1202": 1153, + "1203": 1154, + "1204": 1155, + "1205": 1156, + "1206": 1157, + "1207": 1158, + "1208": 1159, + "1209": 1160, + "1210": 1161, + "1211": 1162, + "1212": 1163, + "1213": 1164, + "1214": 1165, + "1215": 1166, + "1216": 1167, + "1217": 1168, + "1218": 1169, + "1219": 1170, + "1220": 1171, + "1221": 1172, + "1222": 1173, + "1223": 1174, + "1224": 1175, + "1225": 1176, + "1226": 1177, + "1227": 1178, + "1228": 1179, + "1229": 1180, + "1230": 1181, + "1231": 1181, + "1232": 1182, + "1233": 1183, + "1234": 1184, + "1235": 1185, + "1236": 1186, + "1237": 1187, + "1238": 1188, + "1239": 1189, + "1240": 1190, + "1241": 1191, + "1242": 1192, + "1243": 1193, + "1244": 1194, + "1245": 1195, + "1246": 1196, + "1247": 1197, + "1248": 1198, + "1249": 1199, + "1250": 1200, + "1251": 1201, + "1252": 1202, + "1253": 1203, + "1254": 1204, + "1255": 1205, + "1256": 1206, + "1257": 1207, + "1258": 1208, + "1259": 1209, + "1260": 1210, + "1261": 1211, + "1262": 1212, + "1263": 1213, + "1264": 1214, + "1265": 1215, + "1266": 1216, + "1267": 1217, + "1268": 1218, + "1269": 1219, + "1270": 1220, + "1271": 1221, + "1272": 1222, + "1273": 1223, + "1274": 1224, + "1275": 1225, + "1276": 1226, + "1277": 1227, + "1278": 1228, + "1279": 1229, + "1280": 1230, + "1281": 1231, + "1282": 1232, + "1283": 1233, + "1284": 1234, + "1285": 1235, + "1286": 1236, + "1287": 1237, + "1288": 1238, + "1289": 1239, + "1290": 1240, + "1291": 1241, + "1292": 1242, + "1293": 1243, + "1294": 1244, + "1295": 1245, + "1296": 1246, + "1297": 1247, + "1298": 1248, + "1299": 1249, + "1300": 1250, + "1301": 1250, + "1302": 1251, + "1303": 1252, + "1304": 1253, + "1305": 1254, + "1306": 1255, + "1307": 1256, + "1308": 1257, + "1309": 1258, + "1310": 1259, + "1311": 1260, + "1312": 1261, + "1313": 1262, + "1314": 1263, + "1315": 1264, + "1316": 1265, + "1317": 1266, + "1318": 1267, + "1319": 1268, + "1320": 1269, + "1321": 1270, + "1322": 1271, + "1323": 1272, + "1324": 1273, + "1325": 1274, + "1326": 1275, + "1327": 1276, + "1328": 1277, + "1329": 1278, + "1330": 1279, + "1331": 1280, + "1332": 1281, + "1333": 1282, + "1334": 1283, + "1335": 1284, + "1336": 1285, + "1337": 1286, + "1338": 1287, + "1339": 1288, + "1340": 1289, + "1341": 1290, + "1342": 1291, + "1343": 1292, + "1344": 1293, + "1345": 1294, + "1346": 1295, + "1347": 1296, + "1348": 1297, + "1349": 1298, + "1350": 1299, + "1351": 1300, + "1352": 1301, + "1353": 1302, + "1354": 1303, + "1355": 1304, + "1356": 1305, + "1357": 1306, + "1358": 1307, + "1359": 1308, + "1360": 1309, + "1361": 1310, + "1362": 1311, + "1363": 1312, + "1364": 1313, + "1365": 1314, + "1366": 1315, + "1367": 1316, + "1368": 1317, + "1369": 1318, + "1370": 1319, + "1371": 1320, + "1372": 1321, + "1373": 1322, + "1374": 1323, + "1375": 1324, + "1376": 1325, + "1377": 1326, + "1378": 1327, + "1379": 1328, + "1380": 1329, + "1381": 1330, + "1382": 1331, + "1383": 1332, + "1384": 1333, + "1385": 1334, + "1386": 1335, + "1387": 1336, + "1388": 1337, + "1389": 1338, + "1390": 1339, + "1391": 1340, + "1392": 1341, + "1393": 1342, + "1394": 
1343, + "1395": 1344, + "1396": 1345, + "1397": 1346, + "1398": 1347, + "1399": 1348, + "1400": 1349, + "1401": 1350, + "1402": 1351, + "1403": 1352, + "1404": 1353, + "1405": 1354, + "1406": 1355, + "1407": 1356, + "1408": 1357, + "1409": 1358, + "1410": 1359, + "1411": 1360, + "1412": 1361, + "1413": 1362, + "1414": 1363, + "1415": 1364, + "1416": 1365, + "1417": 1366, + "1418": 1366, + "1419": 1367, + "1420": 1367, + "1421": 1368, + "1422": 1369, + "1423": 1370, + "1424": 1371, + "1425": 1372, + "1426": 1373, + "1427": 1374, + "1428": 1375, + "1429": 1376, + "1430": 1377, + "1431": 1378, + "1432": 1379, + "1433": 1380, + "1434": 1381, + "1435": 1382, + "1436": 1383, + "1437": 1384, + "1438": 1385, + "1439": 1386, + "1440": 1387, + "1441": 1388, + "1442": 1389, + "1443": 1390, + "1444": 1391, + "1445": 1392, + "1446": 1393, + "1447": 1394, + "1448": 1395, + "1449": 1396, + "1450": 1397, + "1451": 1398, + "1452": 1399, + "1453": 1400, + "1454": 1401, + "1455": 1402, + "1456": 1403, + "1457": 1404, + "1458": 1405, + "1459": 1406, + "1460": 1407, + "1461": 1408, + "1462": 1409, + "1463": 1410, + "1464": 1411, + "1465": 1412, + "1466": 1413, + "1467": 1414, + "1468": 1415, + "1469": 1416, + "1470": 1417, + "1471": 1418, + "1472": 1419, + "1473": 1420, + "1474": 1421, + "1475": 1422, + "1476": 1423, + "1477": 1424, + "1478": 1425, + "1479": 1426, + "1480": 1427, + "1481": 1428, + "1482": 1429, + "1483": 1430, + "1484": 1431, + "1485": 1432, + "1486": 1433, + "1487": 1434, + "1488": 1435, + "1489": 1436, + "1490": 1437, + "1491": 1438, + "1492": 1439, + "1493": 1440, + "1494": 1441, + "1495": 1442, + "1496": 1443, + "1497": 1444, + "1498": 1445, + "1499": 1446, + "1500": 1447, + "1501": 1448, + "1502": 1449, + "1503": 1450, + "1504": 1451, + "1505": 1452, + "1506": 1453, + "1507": 1454, + "1508": 1455, + "1509": 1456, + "1510": 1457, + "1511": 1458, + "1512": 1459, + "1513": 1460, + "1514": 1461, + "1515": 1462, + "1516": 1463, + "1517": 1464, + "1518": 1465, + "1519": 1466, + "1520": 1467, + "1521": 1468, + "1522": 1469, + "1523": 1470, + "1524": 1471, + "1525": 1472, + "1526": 1473, + "1527": 1474, + "1528": 1475, + "1529": 1476, + "1530": 1477, + "1531": 1478, + "1532": 1479, + "1533": 1480, + "1534": 1481, + "1535": 1482, + "1536": 1483, + "1537": 1484, + "1538": 1485, + "1539": 1486, + "1540": 1487, + "1541": 1488, + "1542": 1489, + "1543": 1490, + "1544": 1491, + "1545": 1491, + "1546": 1492, + "1547": 1492, + "1548": 1493, + "1549": 1494, + "1550": 1495, + "1551": 1496, + "1552": 1497, + "1553": 1498, + "1554": 1499, + "1555": 1500, + "1556": 1501, + "1557": 1502, + "1558": 1503, + "1559": 1504, + "1560": 1505, + "1561": 1506, + "1562": 1507, + "1563": 1508, + "1564": 1509, + "1565": 1509, + "1566": 1510, + "entities": { + "T1": [ + 13, + 71 + ], + "T2": [ + 109, + 120 + ], + "T3": [ + 151, + 154 + ], + "T4": [ + 159, + 166 + ], + "T5": [ + 263, + 281 + ], + "T6": [ + 315, + 321 + ], + "T7": [ + 322, + 341 + ], + "T8": [ + 358, + 385 + ], + "T9": [ + 387, + 405 + ], + "T10": [ + 407, + 423 + ], + "T11": [ + 429, + 440 + ], + "T12": [ + 501, + 515 + ], + "T13": [ + 593, + 626 + ], + "T14": [ + 628, + 632 + ], + "T15": [ + 651, + 656 + ], + "T16": [ + 716, + 722 + ], + "T17": [ + 723, + 732 + ], + "T18": [ + 869, + 879 + ], + "T19": [ + 939, + 957 + ], + "T20": [ + 965, + 973 + ], + "T21": [ + 993, + 999 + ], + "T22": [ + 1000, + 1009 + ], + "T23": [ + 1027, + 1033 + ], + "T24": [ + 1062, + 1066 + ], + "T25": [ + 1088, + 1101 + ], + "T26": [ + 1128, + 1132 + ], + "T27": [ + 1228, + 1234 + 
], + "T28": [ + 1244, + 1250 + ], + "T29": [ + 1294, + 1316 + ], + "T30": [ + 1392, + 1414 + ], + "T31": [ + 1431, + 1439 + ], + "T32": [ + 1462, + 1471 + ], + "T33": [ + 1472, + 1476 + ], + "T34": [ + 1485, + 1491 + ], + "T35": [ + 1492, + 1501 + ], + "T36": [ + 72, + 79 + ], + "T37": [ + 167, + 177 + ], + "T38": [ + 239, + 258 + ], + "T39": [ + 282, + 287 + ], + "T40": [ + 442, + 451 + ], + "T41": [ + 585, + 592 + ], + "T42": [ + 634, + 641 + ], + "T43": [ + 657, + 665 + ], + "T44": [ + 733, + 740 + ], + "T45": [ + 923, + 933 + ], + "T46": [ + 958, + 963 + ], + "T47": [ + 974, + 983 + ], + "T48": [ + 1010, + 1017 + ], + "T49": [ + 1133, + 1140 + ], + "T50": [ + 1317, + 1326 + ], + "T51": [ + 1418, + 1427 + ], + "T52": [ + 1502, + 1509 + ] + } +} \ No newline at end of file diff --git a/data/corpora/cg/debug/PMID-198130.map b/data/corpora/cg/debug/PMID-198130.map new file mode 100644 index 0000000..ef839d0 --- /dev/null +++ b/data/corpora/cg/debug/PMID-198130.map @@ -0,0 +1,1513 @@ +{ + "0": 0, + "1": 1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + "10": 10, + "11": 11, + "12": 12, + "13": 13, + "14": 14, + "15": 15, + "16": 16, + "17": 17, + "18": 18, + "19": 20, + "20": 22, + "21": 23, + "22": 24, + "23": 25, + "24": 26, + "25": 27, + "26": 28, + "27": 29, + "28": 30, + "29": 31, + "30": 32, + "31": 33, + "32": 34, + "33": 35, + "34": 36, + "35": 37, + "36": 38, + "37": 39, + "38": 40, + "39": 41, + "40": 42, + "41": 43, + "42": 44, + "43": 45, + "44": 46, + "45": 47, + "46": 48, + "47": 49, + "48": 50, + "49": 51, + "50": 52, + "51": 53, + "52": 54, + "53": 55, + "54": 56, + "55": 57, + "56": 58, + "57": 59, + "58": 60, + "59": 61, + "60": 62, + "61": 63, + "62": 64, + "63": 65, + "64": 66, + "65": 67, + "66": 68, + "67": 69, + "68": 70, + "69": 71, + "70": 72, + "71": 73, + "72": 74, + "73": 75, + "74": 76, + "75": 77, + "76": 78, + "77": 79, + "78": 80, + "79": 81, + "80": 82, + "81": 83, + "82": 84, + "83": 85, + "84": 86, + "85": 87, + "86": 88, + "87": 89, + "88": 90, + "89": 91, + "90": 92, + "91": 93, + "92": 94, + "93": 95, + "94": 96, + "95": 97, + "96": 98, + "97": 99, + "98": 100, + "99": 101, + "100": 102, + "101": 103, + "102": 104, + "103": 105, + "104": 106, + "105": 107, + "106": 108, + "107": 109, + "108": 110, + "109": 111, + "110": 112, + "111": 113, + "112": 114, + "113": 115, + "114": 116, + "115": 117, + "116": 118, + "117": 119, + "118": 120, + "119": 121, + "120": 123, + "121": 124, + "122": 125, + "123": 126, + "124": 127, + "125": 128, + "126": 129, + "127": 130, + "128": 131, + "129": 132, + "130": 133, + "131": 134, + "132": 135, + "133": 136, + "134": 137, + "135": 138, + "136": 139, + "137": 140, + "138": 141, + "139": 142, + "140": 143, + "141": 144, + "142": 145, + "143": 146, + "144": 147, + "145": 148, + "146": 149, + "147": 150, + "148": 151, + "149": 153, + "150": 154, + "151": 155, + "152": 156, + "153": 157, + "154": 158, + "155": 159, + "156": 160, + "157": 161, + "158": 162, + "159": 163, + "160": 164, + "161": 165, + "162": 166, + "163": 167, + "164": 168, + "165": 169, + "166": 170, + "167": 171, + "168": 172, + "169": 173, + "170": 174, + "171": 175, + "172": 176, + "173": 177, + "174": 178, + "175": 179, + "176": 180, + "177": 182, + "178": 183, + "179": 184, + "180": 185, + "181": 186, + "182": 187, + "183": 188, + "184": 189, + "185": 190, + "186": 191, + "187": 192, + "188": 193, + "189": 194, + "190": 195, + "191": 196, + "192": 197, + "193": 198, + "194": 199, + "195": 200, + "196": 201, + "197": 202, 
+ "198": 203, + "199": 204, + "200": 205, + "201": 206, + "202": 207, + "203": 208, + "204": 209, + "205": 210, + "206": 211, + "207": 212, + "208": 213, + "209": 214, + "210": 215, + "211": 216, + "212": 217, + "213": 218, + "214": 219, + "215": 220, + "216": 221, + "217": 222, + "218": 223, + "219": 224, + "220": 225, + "221": 226, + "222": 227, + "223": 228, + "224": 229, + "225": 230, + "226": 231, + "227": 232, + "228": 233, + "229": 234, + "230": 235, + "231": 236, + "232": 237, + "233": 238, + "234": 239, + "235": 240, + "236": 241, + "237": 242, + "238": 243, + "239": 244, + "240": 245, + "241": 246, + "242": 247, + "243": 248, + "244": 249, + "245": 250, + "246": 251, + "247": 252, + "248": 253, + "249": 254, + "250": 255, + "251": 256, + "252": 257, + "253": 258, + "254": 259, + "255": 260, + "256": 261, + "257": 262, + "258": 263, + "259": 264, + "260": 265, + "261": 266, + "262": 267, + "263": 268, + "264": 269, + "265": 270, + "266": 271, + "267": 272, + "268": 273, + "269": 274, + "270": 275, + "271": 276, + "272": 277, + "273": 278, + "274": 279, + "275": 280, + "276": 281, + "277": 282, + "278": 283, + "279": 284, + "280": 285, + "281": 286, + "282": 287, + "283": 288, + "284": 289, + "285": 290, + "286": 291, + "287": 292, + "288": 293, + "289": 294, + "290": 295, + "291": 296, + "292": 297, + "293": 298, + "294": 299, + "295": 300, + "296": 301, + "297": 302, + "298": 303, + "299": 304, + "300": 305, + "301": 306, + "302": 307, + "303": 308, + "304": 309, + "305": 310, + "306": 311, + "307": 312, + "308": 313, + "309": 314, + "310": 315, + "311": 316, + "312": 317, + "313": 318, + "314": 319, + "315": 320, + "316": 321, + "317": 322, + "318": 323, + "319": 324, + "320": 325, + "321": 326, + "322": 327, + "323": 328, + "324": 329, + "325": 330, + "326": 331, + "327": 332, + "328": 333, + "329": 334, + "330": 335, + "331": 336, + "332": 337, + "333": 338, + "334": 339, + "335": 340, + "336": 341, + "337": 342, + "338": 343, + "339": 344, + "340": 345, + "341": 346, + "342": 347, + "343": 349, + "344": 350, + "345": 351, + "346": 352, + "347": 353, + "348": 354, + "349": 355, + "350": 356, + "351": 357, + "352": 358, + "353": 359, + "354": 360, + "355": 361, + "356": 362, + "357": 363, + "358": 364, + "359": 365, + "360": 366, + "361": 367, + "362": 368, + "363": 369, + "364": 370, + "365": 371, + "366": 372, + "367": 373, + "368": 374, + "369": 375, + "370": 376, + "371": 377, + "372": 378, + "373": 379, + "374": 380, + "375": 381, + "376": 382, + "377": 383, + "378": 384, + "379": 385, + "380": 386, + "381": 387, + "382": 388, + "383": 389, + "384": 390, + "385": 392, + "386": 393, + "387": 394, + "388": 395, + "389": 396, + "390": 397, + "391": 398, + "392": 399, + "393": 400, + "394": 401, + "395": 402, + "396": 403, + "397": 404, + "398": 405, + "399": 406, + "400": 407, + "401": 408, + "402": 409, + "403": 410, + "404": 411, + "405": 413, + "406": 414, + "407": 415, + "408": 416, + "409": 418, + "410": 420, + "411": 421, + "412": 422, + "413": 423, + "414": 424, + "415": 425, + "416": 426, + "417": 427, + "418": 428, + "419": 429, + "420": 430, + "421": 431, + "422": 432, + "423": 434, + "424": 435, + "425": 436, + "426": 437, + "427": 438, + "428": 439, + "429": 440, + "430": 441, + "431": 442, + "432": 443, + "433": 444, + "434": 445, + "435": 446, + "436": 447, + "437": 448, + "438": 449, + "439": 450, + "440": 452, + "441": 453, + "442": 454, + "443": 455, + "444": 456, + "445": 457, + "446": 458, + "447": 459, + "448": 460, + "449": 461, + "450": 462, + "451": 
463, + "452": 464, + "453": 465, + "454": 466, + "455": 467, + "456": 468, + "457": 469, + "458": 470, + "459": 471, + "460": 472, + "461": 473, + "462": 474, + "463": 475, + "464": 476, + "465": 477, + "466": 478, + "467": 479, + "468": 480, + "469": 481, + "470": 482, + "471": 483, + "472": 484, + "473": 485, + "474": 486, + "475": 487, + "476": 488, + "477": 489, + "478": 490, + "479": 491, + "480": 492, + "481": 493, + "482": 494, + "483": 495, + "484": 496, + "485": 497, + "486": 498, + "487": 499, + "488": 500, + "489": 501, + "490": 502, + "491": 503, + "492": 504, + "493": 505, + "494": 506, + "495": 507, + "496": 508, + "497": 509, + "498": 510, + "499": 511, + "500": 512, + "501": 513, + "502": 515, + "503": 517, + "504": 519, + "505": 520, + "506": 521, + "507": 523, + "508": 525, + "509": 526, + "510": 527, + "511": 528, + "512": 529, + "513": 530, + "514": 531, + "515": 533, + "516": 534, + "517": 535, + "518": 536, + "519": 537, + "520": 538, + "521": 539, + "522": 540, + "523": 541, + "524": 542, + "525": 543, + "526": 544, + "527": 545, + "528": 546, + "529": 547, + "530": 548, + "531": 549, + "532": 551, + "533": 552, + "534": 553, + "535": 554, + "536": 555, + "537": 556, + "538": 557, + "539": 558, + "540": 559, + "541": 560, + "542": 561, + "543": 562, + "544": 563, + "545": 564, + "546": 565, + "547": 566, + "548": 567, + "549": 568, + "550": 569, + "551": 570, + "552": 571, + "553": 572, + "554": 573, + "555": 574, + "556": 575, + "557": 576, + "558": 577, + "559": 578, + "560": 579, + "561": 580, + "562": 581, + "563": 582, + "564": 583, + "565": 584, + "566": 585, + "567": 586, + "568": 587, + "569": 588, + "570": 589, + "571": 590, + "572": 591, + "573": 592, + "574": 593, + "575": 594, + "576": 595, + "577": 596, + "578": 597, + "579": 598, + "580": 599, + "581": 600, + "582": 601, + "583": 602, + "584": 603, + "585": 604, + "586": 605, + "587": 606, + "588": 607, + "589": 608, + "590": 609, + "591": 610, + "592": 611, + "593": 612, + "594": 613, + "595": 614, + "596": 615, + "597": 616, + "598": 617, + "599": 618, + "600": 619, + "601": 620, + "602": 621, + "603": 622, + "604": 623, + "605": 624, + "606": 625, + "607": 626, + "608": 627, + "609": 628, + "610": 629, + "611": 630, + "612": 631, + "613": 632, + "614": 633, + "615": 634, + "616": 635, + "617": 636, + "618": 637, + "619": 638, + "620": 639, + "621": 640, + "622": 641, + "623": 642, + "624": 643, + "625": 644, + "626": 645, + "627": 646, + "628": 648, + "629": 649, + "630": 650, + "631": 651, + "632": 653, + "633": 654, + "634": 655, + "635": 656, + "636": 657, + "637": 658, + "638": 659, + "639": 660, + "640": 661, + "641": 662, + "642": 663, + "643": 664, + "644": 665, + "645": 666, + "646": 667, + "647": 668, + "648": 669, + "649": 670, + "650": 671, + "651": 672, + "652": 673, + "653": 674, + "654": 675, + "655": 676, + "656": 677, + "657": 678, + "658": 679, + "659": 680, + "660": 681, + "661": 682, + "662": 683, + "663": 684, + "664": 685, + "665": 686, + "666": 687, + "667": 688, + "668": 689, + "669": 690, + "670": 691, + "671": 692, + "672": 693, + "673": 694, + "674": 695, + "675": 696, + "676": 697, + "677": 698, + "678": 699, + "679": 700, + "680": 701, + "681": 702, + "682": 703, + "683": 704, + "684": 705, + "685": 706, + "686": 707, + "687": 708, + "688": 709, + "689": 710, + "690": 711, + "691": 712, + "692": 713, + "693": 714, + "694": 715, + "695": 716, + "696": 717, + "697": 718, + "698": 719, + "699": 720, + "700": 721, + "701": 722, + "702": 723, + "703": 724, + "704": 725, + 
"705": 726, + "706": 727, + "707": 728, + "708": 729, + "709": 730, + "710": 731, + "711": 732, + "712": 733, + "713": 734, + "714": 735, + "715": 736, + "716": 737, + "717": 738, + "718": 739, + "719": 740, + "720": 741, + "721": 742, + "722": 744, + "723": 746, + "724": 747, + "725": 748, + "726": 749, + "727": 750, + "728": 751, + "729": 752, + "730": 753, + "731": 754, + "732": 755, + "733": 756, + "734": 757, + "735": 758, + "736": 759, + "737": 760, + "738": 761, + "739": 762, + "740": 763, + "741": 764, + "742": 766, + "743": 768, + "744": 770, + "745": 772, + "746": 774, + "747": 776, + "748": 777, + "749": 778, + "750": 779, + "751": 780, + "752": 781, + "753": 782, + "754": 783, + "755": 784, + "756": 785, + "757": 786, + "758": 787, + "759": 788, + "760": 789, + "761": 790, + "762": 791, + "763": 792, + "764": 793, + "765": 795, + "766": 797, + "767": 799, + "768": 800, + "769": 801, + "770": 802, + "771": 803, + "772": 804, + "773": 805, + "774": 806, + "775": 807, + "776": 808, + "777": 809, + "778": 811, + "779": 812, + "780": 813, + "781": 814, + "782": 815, + "783": 816, + "784": 817, + "785": 818, + "786": 819, + "787": 820, + "788": 821, + "789": 822, + "790": 824, + "791": 826, + "792": 827, + "793": 828, + "794": 829, + "795": 831, + "796": 832, + "797": 833, + "798": 834, + "799": 835, + "800": 836, + "801": 838, + "802": 840, + "803": 841, + "804": 842, + "805": 843, + "806": 845, + "807": 846, + "808": 847, + "809": 848, + "810": 849, + "811": 850, + "812": 852, + "813": 854, + "814": 855, + "815": 856, + "816": 857, + "817": 858, + "818": 859, + "819": 860, + "820": 861, + "821": 862, + "822": 863, + "823": 864, + "824": 865, + "825": 866, + "826": 867, + "827": 868, + "828": 869, + "829": 870, + "830": 871, + "831": 872, + "832": 873, + "833": 874, + "834": 875, + "835": 876, + "836": 877, + "837": 878, + "838": 879, + "839": 880, + "840": 881, + "841": 882, + "842": 883, + "843": 884, + "844": 885, + "845": 886, + "846": 887, + "847": 888, + "848": 889, + "849": 890, + "850": 891, + "851": 892, + "852": 893, + "853": 894, + "854": 895, + "855": 896, + "856": 897, + "857": 898, + "858": 899, + "859": 900, + "860": 901, + "861": 902, + "862": 903, + "863": 904, + "864": 905, + "865": 906, + "866": 907, + "867": 908, + "868": 909, + "869": 910, + "870": 911, + "871": 912, + "872": 913, + "873": 914, + "874": 915, + "875": 916, + "876": 917, + "877": 918, + "878": 919, + "879": 921, + "880": 922, + "881": 923, + "882": 924, + "883": 925, + "884": 926, + "885": 927, + "886": 928, + "887": 929, + "888": 930, + "889": 931, + "890": 932, + "891": 933, + "892": 934, + "893": 935, + "894": 936, + "895": 937, + "896": 938, + "897": 939, + "898": 940, + "899": 941, + "900": 942, + "901": 943, + "902": 944, + "903": 945, + "904": 946, + "905": 947, + "906": 948, + "907": 949, + "908": 950, + "909": 951, + "910": 952, + "911": 953, + "912": 954, + "913": 955, + "914": 956, + "915": 957, + "916": 958, + "917": 959, + "918": 960, + "919": 961, + "920": 962, + "921": 963, + "922": 964, + "923": 965, + "924": 966, + "925": 967, + "926": 968, + "927": 969, + "928": 970, + "929": 971, + "930": 972, + "931": 973, + "932": 974, + "933": 976, + "934": 977, + "935": 978, + "936": 979, + "937": 980, + "938": 981, + "939": 982, + "940": 983, + "941": 984, + "942": 985, + "943": 986, + "944": 987, + "945": 988, + "946": 989, + "947": 990, + "948": 991, + "949": 992, + "950": 993, + "951": 994, + "952": 995, + "953": 996, + "954": 997, + "955": 998, + "956": 999, + "957": 1000, + "958": 
1001, + "959": 1002, + "960": 1003, + "961": 1004, + "962": 1005, + "963": 1007, + "964": 1008, + "965": 1009, + "966": 1010, + "967": 1011, + "968": 1012, + "969": 1013, + "970": 1014, + "971": 1015, + "972": 1016, + "973": 1017, + "974": 1018, + "975": 1019, + "976": 1020, + "977": 1021, + "978": 1022, + "979": 1023, + "980": 1024, + "981": 1025, + "982": 1026, + "983": 1028, + "984": 1029, + "985": 1030, + "986": 1031, + "987": 1032, + "988": 1033, + "989": 1034, + "990": 1035, + "991": 1036, + "992": 1037, + "993": 1038, + "994": 1039, + "995": 1040, + "996": 1041, + "997": 1042, + "998": 1043, + "999": 1045, + "1000": 1047, + "1001": 1048, + "1002": 1049, + "1003": 1050, + "1004": 1051, + "1005": 1052, + "1006": 1053, + "1007": 1054, + "1008": 1055, + "1009": 1056, + "1010": 1057, + "1011": 1058, + "1012": 1059, + "1013": 1060, + "1014": 1061, + "1015": 1062, + "1016": 1063, + "1017": 1064, + "1018": 1065, + "1019": 1066, + "1020": 1067, + "1021": 1068, + "1022": 1069, + "1023": 1070, + "1024": 1071, + "1025": 1072, + "1026": 1073, + "1027": 1074, + "1028": 1075, + "1029": 1076, + "1030": 1077, + "1031": 1078, + "1032": 1079, + "1033": 1081, + "1034": 1082, + "1035": 1083, + "1036": 1084, + "1037": 1085, + "1038": 1086, + "1039": 1087, + "1040": 1088, + "1041": 1089, + "1042": 1090, + "1043": 1091, + "1044": 1092, + "1045": 1093, + "1046": 1094, + "1047": 1095, + "1048": 1096, + "1049": 1097, + "1050": 1098, + "1051": 1099, + "1052": 1100, + "1053": 1101, + "1054": 1102, + "1055": 1103, + "1056": 1104, + "1057": 1105, + "1058": 1106, + "1059": 1107, + "1060": 1108, + "1061": 1109, + "1062": 1110, + "1063": 1111, + "1064": 1112, + "1065": 1113, + "1066": 1114, + "1067": 1115, + "1068": 1116, + "1069": 1117, + "1070": 1118, + "1071": 1119, + "1072": 1120, + "1073": 1121, + "1074": 1122, + "1075": 1123, + "1076": 1124, + "1077": 1125, + "1078": 1126, + "1079": 1127, + "1080": 1128, + "1081": 1129, + "1082": 1130, + "1083": 1131, + "1084": 1132, + "1085": 1133, + "1086": 1134, + "1087": 1135, + "1088": 1136, + "1089": 1137, + "1090": 1138, + "1091": 1139, + "1092": 1140, + "1093": 1141, + "1094": 1142, + "1095": 1143, + "1096": 1144, + "1097": 1145, + "1098": 1146, + "1099": 1147, + "1100": 1148, + "1101": 1149, + "1102": 1150, + "1103": 1151, + "1104": 1152, + "1105": 1153, + "1106": 1154, + "1107": 1155, + "1108": 1156, + "1109": 1157, + "1110": 1158, + "1111": 1159, + "1112": 1160, + "1113": 1161, + "1114": 1162, + "1115": 1163, + "1116": 1164, + "1117": 1165, + "1118": 1166, + "1119": 1167, + "1120": 1168, + "1121": 1169, + "1122": 1170, + "1123": 1171, + "1124": 1172, + "1125": 1173, + "1126": 1174, + "1127": 1175, + "1128": 1176, + "1129": 1177, + "1130": 1178, + "1131": 1179, + "1132": 1180, + "1133": 1181, + "1134": 1182, + "1135": 1183, + "1136": 1184, + "1137": 1185, + "1138": 1186, + "1139": 1187, + "1140": 1189, + "1141": 1190, + "1142": 1191, + "1143": 1192, + "1144": 1193, + "1145": 1194, + "1146": 1195, + "1147": 1196, + "1148": 1197, + "1149": 1198, + "1150": 1199, + "1151": 1200, + "1152": 1201, + "1153": 1202, + "1154": 1203, + "1155": 1204, + "1156": 1205, + "1157": 1206, + "1158": 1207, + "1159": 1208, + "1160": 1209, + "1161": 1210, + "1162": 1211, + "1163": 1212, + "1164": 1213, + "1165": 1214, + "1166": 1215, + "1167": 1216, + "1168": 1217, + "1169": 1218, + "1170": 1219, + "1171": 1220, + "1172": 1221, + "1173": 1222, + "1174": 1223, + "1175": 1224, + "1176": 1225, + "1177": 1226, + "1178": 1227, + "1179": 1228, + "1180": 1229, + "1181": 1231, + "1182": 1232, + 
"1183": 1233, + "1184": 1234, + "1185": 1235, + "1186": 1236, + "1187": 1237, + "1188": 1238, + "1189": 1239, + "1190": 1240, + "1191": 1241, + "1192": 1242, + "1193": 1243, + "1194": 1244, + "1195": 1245, + "1196": 1246, + "1197": 1247, + "1198": 1248, + "1199": 1249, + "1200": 1250, + "1201": 1251, + "1202": 1252, + "1203": 1253, + "1204": 1254, + "1205": 1255, + "1206": 1256, + "1207": 1257, + "1208": 1258, + "1209": 1259, + "1210": 1260, + "1211": 1261, + "1212": 1262, + "1213": 1263, + "1214": 1264, + "1215": 1265, + "1216": 1266, + "1217": 1267, + "1218": 1268, + "1219": 1269, + "1220": 1270, + "1221": 1271, + "1222": 1272, + "1223": 1273, + "1224": 1274, + "1225": 1275, + "1226": 1276, + "1227": 1277, + "1228": 1278, + "1229": 1279, + "1230": 1280, + "1231": 1281, + "1232": 1282, + "1233": 1283, + "1234": 1284, + "1235": 1285, + "1236": 1286, + "1237": 1287, + "1238": 1288, + "1239": 1289, + "1240": 1290, + "1241": 1291, + "1242": 1292, + "1243": 1293, + "1244": 1294, + "1245": 1295, + "1246": 1296, + "1247": 1297, + "1248": 1298, + "1249": 1299, + "1250": 1301, + "1251": 1302, + "1252": 1303, + "1253": 1304, + "1254": 1305, + "1255": 1306, + "1256": 1307, + "1257": 1308, + "1258": 1309, + "1259": 1310, + "1260": 1311, + "1261": 1312, + "1262": 1313, + "1263": 1314, + "1264": 1315, + "1265": 1316, + "1266": 1317, + "1267": 1318, + "1268": 1319, + "1269": 1320, + "1270": 1321, + "1271": 1322, + "1272": 1323, + "1273": 1324, + "1274": 1325, + "1275": 1326, + "1276": 1327, + "1277": 1328, + "1278": 1329, + "1279": 1330, + "1280": 1331, + "1281": 1332, + "1282": 1333, + "1283": 1334, + "1284": 1335, + "1285": 1336, + "1286": 1337, + "1287": 1338, + "1288": 1339, + "1289": 1340, + "1290": 1341, + "1291": 1342, + "1292": 1343, + "1293": 1344, + "1294": 1345, + "1295": 1346, + "1296": 1347, + "1297": 1348, + "1298": 1349, + "1299": 1350, + "1300": 1351, + "1301": 1352, + "1302": 1353, + "1303": 1354, + "1304": 1355, + "1305": 1356, + "1306": 1357, + "1307": 1358, + "1308": 1359, + "1309": 1360, + "1310": 1361, + "1311": 1362, + "1312": 1363, + "1313": 1364, + "1314": 1365, + "1315": 1366, + "1316": 1367, + "1317": 1368, + "1318": 1369, + "1319": 1370, + "1320": 1371, + "1321": 1372, + "1322": 1373, + "1323": 1374, + "1324": 1375, + "1325": 1376, + "1326": 1377, + "1327": 1378, + "1328": 1379, + "1329": 1380, + "1330": 1381, + "1331": 1382, + "1332": 1383, + "1333": 1384, + "1334": 1385, + "1335": 1386, + "1336": 1387, + "1337": 1388, + "1338": 1389, + "1339": 1390, + "1340": 1391, + "1341": 1392, + "1342": 1393, + "1343": 1394, + "1344": 1395, + "1345": 1396, + "1346": 1397, + "1347": 1398, + "1348": 1399, + "1349": 1400, + "1350": 1401, + "1351": 1402, + "1352": 1403, + "1353": 1404, + "1354": 1405, + "1355": 1406, + "1356": 1407, + "1357": 1408, + "1358": 1409, + "1359": 1410, + "1360": 1411, + "1361": 1412, + "1362": 1413, + "1363": 1414, + "1364": 1415, + "1365": 1416, + "1366": 1418, + "1367": 1420, + "1368": 1421, + "1369": 1422, + "1370": 1423, + "1371": 1424, + "1372": 1425, + "1373": 1426, + "1374": 1427, + "1375": 1428, + "1376": 1429, + "1377": 1430, + "1378": 1431, + "1379": 1432, + "1380": 1433, + "1381": 1434, + "1382": 1435, + "1383": 1436, + "1384": 1437, + "1385": 1438, + "1386": 1439, + "1387": 1440, + "1388": 1441, + "1389": 1442, + "1390": 1443, + "1391": 1444, + "1392": 1445, + "1393": 1446, + "1394": 1447, + "1395": 1448, + "1396": 1449, + "1397": 1450, + "1398": 1451, + "1399": 1452, + "1400": 1453, + "1401": 1454, + "1402": 1455, + "1403": 1456, + "1404": 1457, + 
"1405": 1458, + "1406": 1459, + "1407": 1460, + "1408": 1461, + "1409": 1462, + "1410": 1463, + "1411": 1464, + "1412": 1465, + "1413": 1466, + "1414": 1467, + "1415": 1468, + "1416": 1469, + "1417": 1470, + "1418": 1471, + "1419": 1472, + "1420": 1473, + "1421": 1474, + "1422": 1475, + "1423": 1476, + "1424": 1477, + "1425": 1478, + "1426": 1479, + "1427": 1480, + "1428": 1481, + "1429": 1482, + "1430": 1483, + "1431": 1484, + "1432": 1485, + "1433": 1486, + "1434": 1487, + "1435": 1488, + "1436": 1489, + "1437": 1490, + "1438": 1491, + "1439": 1492, + "1440": 1493, + "1441": 1494, + "1442": 1495, + "1443": 1496, + "1444": 1497, + "1445": 1498, + "1446": 1499, + "1447": 1500, + "1448": 1501, + "1449": 1502, + "1450": 1503, + "1451": 1504, + "1452": 1505, + "1453": 1506, + "1454": 1507, + "1455": 1508, + "1456": 1509, + "1457": 1510, + "1458": 1511, + "1459": 1512, + "1460": 1513, + "1461": 1514, + "1462": 1515, + "1463": 1516, + "1464": 1517, + "1465": 1518, + "1466": 1519, + "1467": 1520, + "1468": 1521, + "1469": 1522, + "1470": 1523, + "1471": 1524, + "1472": 1525, + "1473": 1526, + "1474": 1527, + "1475": 1528, + "1476": 1529, + "1477": 1530, + "1478": 1531, + "1479": 1532, + "1480": 1533, + "1481": 1534, + "1482": 1535, + "1483": 1536, + "1484": 1537, + "1485": 1538, + "1486": 1539, + "1487": 1540, + "1488": 1541, + "1489": 1542, + "1490": 1543, + "1491": 1545, + "1492": 1547, + "1493": 1548, + "1494": 1549, + "1495": 1550, + "1496": 1551, + "1497": 1552, + "1498": 1553, + "1499": 1554, + "1500": 1555, + "1501": 1556, + "1502": 1557, + "1503": 1558, + "1504": 1559, + "1505": 1560, + "1506": 1561, + "1507": 1562, + "1508": 1563, + "1509": 1565, + "1510": 1566 +} \ No newline at end of file diff --git a/data/corpora/cg/debug/PMID-198130.txt b/data/corpora/cg/debug/PMID-198130.txt new file mode 100644 index 0000000..09918d5 --- /dev/null +++ b/data/corpora/cg/debug/PMID-198130.txt @@ -0,0 +1,6 @@ +Magnitude of malate - aspartate reduced nicotinamide adenine dinucleotide shuttle activity in intact respiring tumor cells . +Measurements of respiration , CO2 and lactate production , and changes in the levels of various key metabolites of the glycolytic sequence and tricarboxylic acid cycle were made on five lines of rodent ascites tumor cells ( two strains of Ehrlich ascites tumor cells , Krebs II carcinoma , AS - 30D carcinoma , and L1210 cells ) incubated aerobically in the presence of uniformly labeled D - [ 14C ] glucose . +From these data , as well as earlier evidence demonstrating that the reduced nicotinamide adenine dinucleotide ( NADH ) shuttle in these cells requires a transaminase step and is thus identified as the malate - aspartate shuttle ( W . V . V . Greenhouse and A . L . Lehninger , Cancer Res . , 36 : 1392 - 1396 , 1976 ) , metabolic flux diagrams were constructed for the five cell lines . +These diagrams show the relative rates of glycolysis , the tricarboxylic acid cycle , electron transport , and the malate - aspartate shuttle in these tumors . +Large amounts of cytosolic NADH were oxidized by the mitochondrial respiratory chain via the NADH shuttle , comprising anywhere from about 20 to 80 % of the total flow of reducing equivalents to oxygen in these tumors . +Calculations of the sources of energy for adenosine triphosphate synthesis indicated that on the average about one - third of the respiratory adenosine triphosphate is generated by electron flow originating from cytosolic NADH via the malate - aspartate shuttle . 
diff --git a/data/corpora/cg/debug/PMID-198130.txt.ori b/data/corpora/cg/debug/PMID-198130.txt.ori
new file mode 100644
index 0000000..87d7ea0
--- /dev/null
+++ b/data/corpora/cg/debug/PMID-198130.txt.ori
@@ -0,0 +1,2 @@
+Magnitude of malate-aspartate reduced nicotinamide adenine dinucleotide shuttle activity in intact respiring tumor cells.
+Measurements of respiration, CO2 and lactate production, and changes in the levels of various key metabolites of the glycolytic sequence and tricarboxylic acid cycle were made on five lines of rodent ascites tumor cells (two strains of Ehrlich ascites tumor cells, Krebs II carcinoma, AS-30D carcinoma, and L1210 cells) incubated aerobically in the presence of uniformly labeled D-[14C]glucose. From these data, as well as earlier evidence demonstrating that the reduced nicotinamide adenine dinucleotide (NADH) shuttle in these cells requires a transaminase step and is thus identified as the malate-aspartate shuttle (W.V.V. Greenhouse and A.L. Lehninger, Cancer Res., 36: 1392-1396, 1976), metabolic flux diagrams were constructed for the five cell lines. These diagrams show the relative rates of glycolysis, the tricarboxylic acid cycle, electron transport, and the malate-aspartate shuttle in these tumors. Large amounts of cytosolic NADH were oxidized by the mitochondrial respiratory chain via the NADH shuttle, comprising anywhere from about 20 to 80% of the total flow of reducing equivalents to oxygen in these tumors. Calculations of the sources of energy for adenosine triphosphate synthesis indicated that on the average about one-third of the respiratory adenosine triphosphate is generated by electron flow originating from cytosolic NADH via the malate-aspartate shuttle.

From dc8c5356dd69b1016f4c4a824384f7105ba6636f Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Mon, 21 Mar 2022 16:33:23 +0900
Subject: [PATCH 14/70] training model

---
 eval/evalEV.py              |  381 ++++-----
 eval/evalNER.py             |  210 +++++
 eval/evalRE.py              |  414 +++++++---
 eval/evaluate.py            |  413 +++++++++-
 eval/scripts/n2c2.py        | 1511 +++++++++++++++++++++++++++++++++++
 loader/prepData/brat.py     |  164 +++-
 loader/prepData/entity.py   |  132 ++-
 loader/prepData/event.py    |  270 +++++++
 loader/prepData/prepdata.py |   47 +-
 loader/prepData/relation.py |  118 +++
 loader/prepData/sentence.py |  105 ++-
 loader/prepNN/ent2net.py    |   41 +-
 loader/prepNN/ev2net.py     |  306 +++++++
 loader/prepNN/mapping.py    |  252 ++++++
 loader/prepNN/prep4nn.py    |   61 +-
 loader/prepNN/rel2net.py    |   23 +
 loader/prepNN/sent2net.py   |    6 +-
 loader/prepNN/span4nn.py    |   79 +-
 loader/prepNN/structure.py  |  283 +++++++
 {nets => model}/EVGen.py    |  223 +++++-
 {nets => model}/EVNet.py    |  209 +++--
 {nets => model}/NERNet.py   |  110 ++-
 model/RELNet.py             |  309 +++++++
 model/deepEM.py             |  484 +++++++++++
 model/training.py           |  225 ++++++
 nets/RELNet.py              |  147 ----
 nets/deepEM.py              |  298 ------
 train.py                    |  215 +++++
 28 files changed, 6089 insertions(+), 947 deletions(-)
 create mode 100644 eval/evalNER.py
 create mode 100644 eval/scripts/n2c2.py
 create mode 100644 loader/prepData/event.py
 create mode 100644 loader/prepData/relation.py
 create mode 100644 loader/prepNN/ev2net.py
 create mode 100644 loader/prepNN/mapping.py
 create mode 100644 loader/prepNN/rel2net.py
 create mode 100644 loader/prepNN/structure.py
 rename {nets => model}/EVGen.py (85%)
 rename {nets => model}/EVNet.py (80%)
 rename {nets => model}/NERNet.py (70%)
 create mode 100644 model/RELNet.py
 create mode 100644 model/deepEM.py
 create mode 100644 model/training.py
 delete mode 100644 nets/RELNet.py
 delete mode 100644 nets/deepEM.py
 create mode 100644 train.py
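The first file in the diff below, eval/evalEV.py, writes brat standoff (.a2) output with three line shapes: text-bound T lines, event E lines whose arguments are Role:Id pairs, and modality M lines. For orientation, a minimal sketch of a writer for those shapes; the IDs, types, and role names in the example are hypothetical CG-style placeholders, not taken from this patch:

```python
def write_a2(path, triggers, events, modalities):
    """Write brat .a2 standoff lines in the three shapes emitted by write_ev_2file.

    triggers:   list of (id, type, start, end, text) tuples
    events:     list of (id, type, trigger_id, [(role, arg_id), ...]) tuples
    modalities: list of (id, value, event_id) tuples
    """
    with open(path, "w") as out:
        # text-bound annotations: "T1<TAB>Type start end<TAB>text"
        for tid, ttype, start, end, text in triggers:
            out.write("{}\t{} {} {}\t{}\n".format(tid, ttype, start, end, text))
        # events: "E1<TAB>Type:T1 Role1:Id1 Role2:Id2 ..."
        for eid, etype, tid, args in events:
            arg_str = "".join(" {}:{}".format(role, aid) for role, aid in args)
            out.write("{}\t{}:{}{}\n".format(eid, etype, tid, arg_str))
        # modality: "M1<TAB>Value E1"
        for mid, value, eid in modalities:
            out.write("{}\t{} {}\n".format(mid, value, eid))

# Illustrative call (types and roles are hypothetical):
write_a2("PMID-198130.a2",
         triggers=[("T10", "Gene_expression", 0, 9, "induction")],
         events=[("E1", "Gene_expression", "T10", [("Theme", "T3")])],
         modalities=[("M1", "Speculation", "E1")])
```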
diff --git a/eval/evalEV.py b/eval/evalEV.py
index 5b6725c..7590b4a 100644
--- a/eval/evalEV.py
+++ b/eval/evalEV.py
@@ -1,6 +1,8 @@
 import collections
 import os
 
+from loguru import logger
+
 
 def get_entity_attrs(e_idx, words, offset, span_indices, sub_to_words):
     e_span_indice = span_indices[e_idx]
@@ -67,6 +69,7 @@ def generate_events(fids, all_ev_preds, params):
 
     # accumulated event numbers to count event id
     acc_evid = 0
+    # ev_count = 0
 
     # store event ids
     evids_ = collections.OrderedDict()
@@ -81,8 +84,15 @@ def generate_events(fids, all_ev_preds, params):
 
             # set event id
             ev_id = xx1 + acc_evid
+            # if level == 0:
+            #     ev_id = xx1
+            #     ev_id = acc_evid + xx1
+            # else:
+            #     ev_id = level * len(ev_preds_levels_[level-1]) + xx1
+            #     ev_id = acc_evid + xx1
 
             ev_id_str = (str(xi) + '_' + str(ev_id))
+            # ev_id_ = (xi, ev_id)
 
             # store evid for nested events
             evids_[(level, xx1)] = ev_id_str
@@ -103,9 +113,15 @@
             ev_data.append((xi, (trid[0], trid[1])))
 
             # get relation structure
+            # rel_struct_counter = rel_struct_[0]
             rel_struct_list = rel_struct_[1]
 
+            # check no-argument
+            # if len(a2ids) == 0:
+            #     continue
+
             # has argument
+            # if len(a2ids) > 0:
             if len(rel_struct_list) > 0:
 
                 # store args_data
@@ -116,6 +132,8 @@
 
                 for argid, a2id in enumerate(a2ids):
 
+                    # print(argid, rel_struct_list, rel_struct_counter, trid, a2ids)
+
                     # get relation type id
                     rel_group = rel_struct_list[argid]  # (rtypeid, argtypeid)
                     rtypeid = rel_group[0]
@@ -127,12 +145,17 @@
                     else:
                         dup_rtypes[rtypeid] += 1
 
+                    # create id for a2
+
+                    # check whether this is entity or event argument
+                    # event argument
                     if level > 0 and len(a2id) > 2:
-
+                        # evlevel = a2id[1]
+                        # evxx1 = a2id[2]
                         evlevel_id = a2id[2]
 
                         # look up in the event ids list
+                        # added_evid = evids_[(evlevel, evxx1)]
                         added_evid = evids_[evlevel_id]
                         a2bid = (added_evid, -1, -1)  # add -1 to mark the event argument
@@ -292,253 +315,105 @@ def convert_evid_to_number(str_evid):
     return int(evid[0] + evid[1])
 
 
-def mapping_entity_id(en_preds_, g_entity_ids_, rev_type_map, params):
-    # if gold entity, starting trigger id from max entity id + 1
-    if not params['ner_predict_all'] and len(g_entity_ids_) > 0:
-        eid = g_entity_ids_[-1] + 1
-
-    # predict both entity and trigger
-    else:
-        eid = 1
-
-    # mapping
-    enid_mapping = collections.OrderedDict()
-    en_preds_out_ = []
-
-    # entity in a2
-    a2_ents_ = []
-
-    # create mapping for entity id first
-    for pr_id, en_pred in en_preds_.items():
-
-        # id
-        en_id = en_pred[0]
-
-        if en_id.startswith('TR'):
-            continue
-
-        elif en_id.startswith('T'):
-            enid_mapping[en_id] = 'T' + str(eid)
-            eid += 1
-            en_preds_out_.append(en_pred)
-
-            # using gold entity but in a2
-            if not params['ner_predict_all']:
-                etype = rev_type_map[en_pred[1]]
-
-                # check entity type in a2
-                if etype in params['a2_entities']:
-                    a2_ents_.append(en_id)
-
-    # creat mapping for trigger id
-    for pr_id, en_pred in en_preds_.items():
-        # id
-        en_id = en_pred[0]
-
-        if en_id.startswith('TR'):
-            enid_mapping[en_id] = 'T' + str(eid)
-            eid += 1
-            en_preds_out_.append(en_pred)
-
-    return enid_mapping, en_preds_out_, a2_ents_
-
-
 # write events to file
-def write_ev_2file(pred_output, pred_ents, result_dir, g_entity_ids_, params):
-    a2dir = result_dir + 'ev-last/ev-tok-a2/'
-    anndir = result_dir + 'ev-last/ev-tok-ann/'
+def write_ev_2file(pred_output, result_dir, params):
 
     rev_type_map = 
params['mappings']['rev_type_map'] - # entity id mapping - # feid_mapping = collections.OrderedDict() - - if not os.path.exists(a2dir): - os.makedirs(a2dir) + dir2wr = result_dir + 'ev-last/ev-ann/' + if not os.path.exists(dir2wr): + os.makedirs(dir2wr) else: - os.system('rm ' + a2dir + '*.a2') + os.system('rm ' + dir2wr + '*.a2') - if not os.path.exists(anndir): - os.makedirs(anndir) - else: - os.system('rm ' + anndir + '*.a2') - os.system('rm ' + anndir + '*.a1') - - # write event and triggers, (and entity: if predict both entity and trigger) for fid, preds in pred_output.items(): - ev_en_preds_ = preds[0] + triggers = preds[0] events = preds[1] - enid_mapping, en_preds_out_, a2_ents_ = mapping_entity_id(pred_ents[fid], g_entity_ids_[fid], rev_type_map, params) - - # entity and trigger for ann file - ann_en_lines = [] - ann_tr_lines = [] - # write entity and trigger from entity predictions - for pr_id, e_pred in pred_ents[fid].items(): - e0_id = e_pred[0] - e_id = enid_mapping[e0_id] - - output = ''.join( - [e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t', - e_pred[3], '\n']) + with open(dir2wr + fid + '.a2', 'w') as o2file: - if e0_id.startswith('TR'): - ann_tr_lines.append(output) + for trigger in triggers: + o2file.write(trigger[0].replace('TR', 'T') + '\t' + rev_type_map[trigger[1]] + ' ' + + str(trigger[2][0]) + ' ' + str(trigger[2][1]) + '\t' + trigger[3] + '\n') - # only write entity to a1 - elif e0_id.startswith('T'): - - # entity in a2 - if e0_id in a2_ents_: - ann_tr_lines.append(output) - - else: - ann_en_lines.append(output) + # count event id + f_evid = 0 + # mapping event id to incremental id + f_evid_map = collections.OrderedDict() - # entity and trigger output for a2 - a2_en_lines_ = [] - a2_tr_lines_ = [] + # store modality + mod_list = [] - # write entity and trigger only included event predictions - # write entity and then trigger - for e_pred in ev_en_preds_: - e0_id = e_pred[0] - e_id = enid_mapping[e0_id] + for event_ in events: - output = ''.join( - [e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t', - e_pred[3], '\n']) - - if e0_id.startswith('TR'): - a2_tr_lines_.append(output) - elif e0_id.startswith('T'): - - # entity in a2 - if e0_id in a2_ents_: - a2_tr_lines_.append(output) + # create event id + evid = convert_evid_to_number(event_[0]) + # lookup in the map or create a new id + if evid in f_evid_map: + evid_out = f_evid_map[evid] else: - a2_en_lines_.append(output) - - # event output - ev_lines = [] - - # count event id - f_evid = 0 - - # mapping event id to incremental id - f_evid_map = collections.OrderedDict() - - # store modality - mod_list = [] - - for event_ in events: - - # create event id - evid = convert_evid_to_number(event_[0]) + f_evid += 1 + evid_out = f_evid + f_evid_map[evid] = evid_out + + idTR = event_[1][0].replace('TR', 'T') + typeEV = rev_type_map[event_[1][1]] + args_data = event_[2] + mod_pred = event_[3] + + args_output = '' + for arg_ in args_data: + + # relation type + typeR = arg_[0] + + # check event or entity argument + if len(arg_) > 2: + argIdE = arg_[1] + nest_evid = convert_evid_to_number(argIdE) + if nest_evid in f_evid_map: + nest_evid_out = f_evid_map[nest_evid] + idT = 'E' + str(nest_evid_out) + else: + print('ERROR: NESTED EVENT BUT MISSING EVENT ARGUMENT.') - # lookup in the map or create a new id - if evid in f_evid_map: - evid_out = f_evid_map[evid] - else: - f_evid += 1 - evid_out = f_evid - f_evid_map[evid] = evid_out - - trid = 
event_[1][0] - trid = enid_mapping[trid] - typeEV = rev_type_map[event_[1][1]] - args_data = event_[2] - mod_pred = event_[3] - - args_output = '' - for arg_ in args_data: - - # relation type - typeR = arg_[0] - - # check event or entity argument - if len(arg_) > 2: - argIdE = arg_[1] - nest_evid = convert_evid_to_number(argIdE) - if nest_evid in f_evid_map: - nest_evid_out = f_evid_map[nest_evid] - eid = 'E' + str(nest_evid_out) + # entity argument else: - print('ERROR: NESTED EVENT BUT MISSING EVENT ARGUMENT.') + a2data = arg_[1] + idT = a2data[0].replace('TR', 'T') - # entity argument - else: - a2data = arg_[1] - eid = a2data[0] + if len(args_output) > 0: + args_output += ' ' - # mapping entity id: predict entity or entity in a2 - if params['ner_predict_all'] or eid in a2_ents_: - eid = enid_mapping[eid] + args_output += typeR + ':' + idT + # if has argument if len(args_output) > 0: - args_output += ' ' + o2file.write('E' + str(evid_out) + '\t' + typeEV + ':' + idTR + ' ' + args_output + '\n') - args_output += typeR + ':' + eid + # no argument + else: + o2file.write('E' + str(evid_out) + '\t' + typeEV + ':' + idTR + '\n') - # if has argument - if len(args_output) > 0: - output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, ' ', args_output, '\n']) - ev_lines.append(output) + # check and store modality + if mod_pred > 1: + mod_value = params['mappings']['rev_modality_map'][mod_pred] + mod_list.append([mod_value, evid_out]) - # no argument - else: - output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, '\n']) - ev_lines.append(output) - - # check and store modality - if mod_pred > 1: - mod_value = params['mappings']['rev_modality_map'][mod_pred] - mod_list.append([mod_value, evid_out]) - - # write modality - if len(mod_list) > 0: - for mod_id, mod_data in enumerate(mod_list): - mod_type = mod_data[0] - evid_out = mod_data[1] - output = ''.join(['M', str(mod_id + 1), '\t', mod_type, ' ', 'E', str(evid_out), '\n']) - ev_lines.append(output) - - # write a2 files - with open(a2dir + fid + '.a2', 'w') as o2file: - - # write entity - if params['ner_predict_all']: - for entity in a2_en_lines_: - o2file.write(entity) - - for trigger in a2_tr_lines_: - o2file.write(trigger) - for event in ev_lines: - o2file.write(event) - - # write ann file - with open(anndir + fid + '.a1', 'w') as o1file: - for entity in ann_en_lines: - o1file.write(entity) - - with open(anndir + fid + '.a2', 'w') as annfile: - for entity in ann_en_lines: - annfile.write(entity) - for trigger in ann_tr_lines: - annfile.write(trigger) - - # events are the same for both a2 and ann - for event in ev_lines: - annfile.write(event) + # write modality + if len(mod_list) > 0: + for mod_id, mod_data in enumerate(mod_list): + mod_type = mod_data[0] + evid_out = mod_data[1] + o2file.write('M' + str(mod_id + 1) + '\t' + mod_type + ' ' + 'E' + str(evid_out) + '\n') return # generate event output and evaluation -def write_events(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words, - all_ev_preds, g_entity_ids_, params, result_dir): +def evaluate_ev(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words, + all_ev_preds, params, gold_dir, result_dir): # generate predicted entities pred_ents = generate_entities(fids=fids, all_e_preds=all_ent_preds, @@ -558,6 +433,70 @@ def write_events(fids, all_ent_preds, all_words, all_offsets, all_span_terms, al preds_output = generate_ev_output(pred_ents, pred_evs, params) # write output to file - 
write_ev_2file(preds_output, pred_ents, result_dir, g_entity_ids_, params) - - return + _ = write_ev_2file(preds_output, result_dir, params) + + # calculate score + ev_scores = eval_performance(gold_dir, result_dir, params) + + return ev_scores + + +def eval_performance(ref_dir, result_dir, params): + # create prediction paths + pred_dir = ''.join([result_dir, 'ev-last/ev-ann/']) + pred_scores_file = ''.join([result_dir, 'ev-last/', 'ev-scores-', params['task_name'], params['ev_matching'], '.txt']) + + try: + + command = ''.join( + ["python " + params['ev_eval_script_path'], " -r ", ref_dir, " -d ", pred_dir, " ", params['ev_matching'], + " > ", pred_scores_file]) + + # exception for ezcat task + if 'ezcat' in params['task_name']: + command = ''.join( + ["python " + params['ev_eval_script_path'], " -r ", ref_dir, " ", pred_dir, " ", + params['ev_matching'], + " > ", pred_scores_file]) + + os.system(command) + ev_scores = extract_fscore(pred_scores_file) + except Exception as ex: + ev_scores = {} + logger.exception(ex) + + return ev_scores + + +def extract_fscore(path): + file = open(path, 'r') + lines = file.readlines() + sub_fscore = '0' + sub_recall = '0' + sub_precision = '0' + mod_fscore = '0' + mod_recall = '0' + mod_precision = '0' + tot_fscore = '0' + tot_recall = '0' + tot_precision = '0' + for line in lines: + if line.split()[0] == '===[SUB-TOTAL]===': + tokens = line.split() + sub_recall = tokens[-3] + sub_precision = tokens[-2] + sub_fscore = tokens[-1] + elif line.split()[0] == '==[MOD-TOTAL]==': + tokens = line.split() + mod_recall = tokens[-3] + mod_precision = tokens[-2] + mod_fscore = tokens[-1] + elif line.split()[0] == '====[TOTAL]====': + tokens = line.split() + tot_recall = tokens[-3] + tot_precision = tokens[-2] + tot_fscore = tokens[-1] + + return {'sub_scores': (float(sub_precision.strip()), float(sub_recall.strip()), float(sub_fscore.strip())), + 'mod_scores': (float(mod_precision.strip()), float(mod_recall.strip()), float(mod_fscore.strip())), + 'tot_scores': (float(tot_precision.strip()), float(tot_recall.strip()), float(tot_fscore.strip()))} diff --git a/eval/evalNER.py b/eval/evalNER.py new file mode 100644 index 0000000..c26ed0a --- /dev/null +++ b/eval/evalNER.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- +# © Khoa Duong (dnanhkhoa@live.com) +from collections import defaultdict + +import texttable + + +def precision(tp=0, fp=0): + if tp + fp: + return tp / (tp + fp) + return 0.0 + + +def recall(tp=0, fn=0): + if tp + fn: + return tp / (tp + fn) + return 0.0 + + +def f_score(precision, recall, beta=1.0): + """ + The beta parameter determines the weight of precision in the combined score. beta < 1 lends more weight to precision, while beta > 1 favors recall (beta -> 0 considers only precision, beta -> inf only recall). 
+ """ + numerator = (1 + beta ** 2) * precision * recall + denominator = beta ** 2 * precision + recall + if denominator: + return numerator / denominator + return 0.0 + + +def specificity(tn=0, fp=0): + # FPR + if tn + fp: + return tn / (tn + fp) + return 0.0 + + +def sensitivity(tp=0, fn=0): + # TPR + return recall(tp, fn) + + +def auc(specificity, sensitivity): + return (specificity + sensitivity) / 2 + + +def measure(tp=0, tn=0, fp=0, fn=0, beta=1.0, lenient=False): + if lenient and tp + fn == 0: + tp, tn, fp, fn = 1, 1, 0, 0 + + _precision = precision(tp, fp) + _recall = recall(tp, fn) + _f_score = f_score(_precision, _recall, beta) + _specificity = specificity(tn, fp) + _sensitivity = sensitivity(tp, fn) + _auc = auc(_specificity, _sensitivity) + + return { + "precision": _precision * 100, + "recall": _recall * 100, + "f_score": _f_score * 100, + "specificity": _specificity * 100, + "sensitivity": _sensitivity * 100, + "auc": _auc * 100, + } + + +def count(pred_entities, gold_entities, label): + assert label, "Label is invalid" + + # Remove duplicates + pred_entities = {e: True for e in pred_entities} + gold_entities = {e: True for e in gold_entities} + + positions = {**pred_entities, **gold_entities} + + padded_pred_entities, padded_gold_entities = [], [] + + for k in positions: + if k in pred_entities and k[-1] == label: + padded_pred_entities.append(k[-1]) + else: + padded_pred_entities.append(None) + + if k in gold_entities and k[-1] == label: + padded_gold_entities.append(k[-1]) + else: + padded_gold_entities.append(None) + + matches = list(zip(padded_pred_entities, padded_gold_entities)) + + return { + "tp": matches.count((label, label)), + "tn": matches.count((None, None)), + "fp": matches.count((label, None)), + "fn": matches.count((None, label)), + } + + +def eval_nner(preds, golds, labels, beta=1.0, lenient=False): + num_pred_sentences = len(preds) + num_gold_sentences = len(golds) + assert num_pred_sentences == num_gold_sentences + + all_scores = [] + counts = [defaultdict(int) for _ in labels] + + for sentence_id in range(num_gold_sentences): + pred_entities = set(preds[sentence_id]) + gold_entities = set(golds[sentence_id]) + + for label_id, label in enumerate(labels): + for k, v in count(pred_entities, gold_entities, label).items(): + counts[label_id][k] += v + + tt = texttable.Texttable() + tt.set_cols_width([28] + [10] * 6 + [10] * 3) + tt.set_cols_dtype(["t", "f", "f", "f", "f", "f", "f", "i", "i", "i"]) + tt.set_cols_align(["l"] * 10) + tt.header( + [ + "Labels", + "Prec.", + "Rec.", + "F(b={})".format(beta), + "Speci.", + "Sensi.", + "AUC", + "Pred.", + "Gold.", + "Corr.", + ] + ) + + total_counts = defaultdict(int) + + for label_id, label in enumerate(labels): + score = measure( + counts[label_id]["tp"], + counts[label_id]["tn"], + counts[label_id]["fp"], + counts[label_id]["fn"], + beta, + lenient, + ) + + total_counts["tp"] += counts[label_id]["tp"] + total_counts["tn"] += counts[label_id]["tn"] + total_counts["fp"] += counts[label_id]["fp"] + total_counts["fn"] += counts[label_id]["fn"] + + all_scores.append( + [ + label, + score["precision"], + score["recall"], + score["f_score"], + score["specificity"], + score["sensitivity"], + score["auc"], + counts[label_id]["tp"] + counts[label_id]["fp"], + counts[label_id]["tp"] + counts[label_id]["fn"], + counts[label_id]["tp"], + ] + ) + tt.add_row(all_scores[-1]) + + score = measure( + total_counts["tp"], + total_counts["tn"], + total_counts["fp"], + total_counts["fn"], + beta, + lenient, + ) + + all_scores.append( + 
[ + "Overall", + score["precision"], + score["recall"], + score["f_score"], + score["specificity"], + score["sensitivity"], + score["auc"], + total_counts["tp"] + total_counts["fp"], + total_counts["tp"] + total_counts["fn"], + total_counts["tp"], + ] + ) + tt.add_row(all_scores[-1]) + + return tt.draw(), all_scores + + +if __name__ == "__main__": + labels = ["A", "B", "C", "D", "E"] # DO NOT INCLUDE "O" + golds = [ + [(0, 2, "E"), (0, 2, "A"), (1, 3, "B"), (4, 6, "D")], + [(0, 1, "C"), (3, 4, "A")], + [(2, 3, "B"), (3, 4, "A"), (4, 5, "C"), (4, 5, "A")], + ] + preds = [ + [(0, 2, "E"), (0, 2, "A"), (1, 3, "C"), (4, 6, "D"), (4, 6, "D")], + [(0, 1, "C"), (3, 4, "A")], + [(2, 3, "B"), (3, 4, "A"), (4, 5, "D"), (4, 5, "A")], + ] + res, score = eval_nner(preds, golds, labels) + print(res) + print(score) diff --git a/eval/evalRE.py b/eval/evalRE.py index e46d2cd..ba3c077 100644 --- a/eval/evalRE.py +++ b/eval/evalRE.py @@ -1,11 +1,54 @@ -import torch import os -import collections from collections import defaultdict +import numpy as np +import torch +from tabulate import tabulate + from utils.utils import write_lines +class MeasureStatistics: + """ + Calculate: True Positives (TP), False Positives (FP), False Negatives (FN) + GPU & CPU code + """ + + def __init__(self, params, beta): + self.params = params + self.beta = beta + + def __call__(self, *inputs): + label_num = self.params['voc_sizes']['rel_size'] + ignore_label = self.params['lab2ign_id'] + y, t = inputs + + if label_num is None: + label_num = torch.max(t) + 1 + else: + label_num = torch.tensor(label_num) + + mask_t = (t == ignore_label).view(-1) # where the ground truth needs to be ignored + true = torch.where(mask_t, label_num, t.view(-1)) # t: ground truth labels (replace ignored with 13) + mask_p = (y == ignore_label).view(-1) # where the predicted needs to be ignored + pred = torch.where(mask_p, label_num, y.view(-1)) # y: output of neural network (replace ignored with 13) + + tp_mask = torch.where(pred == true, true, label_num) + fp_mask = torch.where(pred != true, pred, label_num) + fn_mask = torch.where(pred != true, true, label_num) + + try: + tp = torch.bincount(tp_mask, minlength=label_num + 1)[:label_num] + fp = torch.bincount(fp_mask, minlength=label_num + 1)[:label_num] + fn = torch.bincount(fn_mask, minlength=label_num + 1)[:label_num] + except: + tp = torch.zeros(label_num) + fp = torch.zeros(label_num) + fn = torch.zeros(label_num) + + return tp, fp, fn + + class SelectClass: """ Correct predictions: From 2 direction relations choose @@ -24,18 +67,40 @@ def __call__(self, *inputs): ignore = torch.tensor(self.params['lab2ign_id']) cpu_device = torch.device("cpu") - y_lr, y_rl = inputs + y_lr, y_rl, truth_lr, truth_rl = inputs y_lr = y_lr.to(cpu_device) y_rl = y_rl.to(cpu_device) if self.params['fp16']: y_lr = y_lr.float() y_rl = y_rl.float() + truth_lr = torch.tensor(truth_lr).long() + truth_rl = torch.tensor(truth_rl).long() + + no_rel_matched_indices = 0 + no_rel_matched_types = 0 + + try: + labels_lr = y_lr.argmax(dim=1).view(-1) + labels_rl = y_rl.argmax(dim=1).view(-1) + except: + return truth_lr, truth_lr, {'no_rel_matched_indices': no_rel_matched_indices, + 'no_rel_matched_types': no_rel_matched_types} + m = torch.arange(labels_lr.shape[0]) - labels_lr = y_lr.argmax(dim=1).view(-1) - labels_rl = y_rl.argmax(dim=1).view(-1) + # count rel matched indices / types + if not self.params['predict']: + lr_ids = (truth_lr != -1).nonzero().transpose(0, 1) + rl_ids = (truth_rl != -1).nonzero().transpose(0, 1) - m = 
torch.arange(labels_lr.shape[0]) + no_rel_matched_indices += (lr_ids.shape[1] + rl_ids.shape[1]) + + lr_rel_matched_types = labels_lr[lr_ids] - truth_lr[lr_ids] + rl_rel_matched_types = labels_rl[rl_ids] - truth_rl[rl_ids] + + no_rel_matched_types += ( + (lr_rel_matched_types == 0).nonzero().shape[0] + (rl_rel_matched_types == 0).nonzero().shape[0]) + # split predictions into 2 arrays: relations + inv-relations lr_probs = y_lr[m, labels_lr] rl_probs = y_rl[m, labels_rl] inv_lr = labmap[labels_lr] @@ -53,14 +118,20 @@ def __call__(self, *inputs): rl_probs, negative_val.float()) a_x4 = torch.where((a4 >= a5) & (a4 != -1) & (a5 != -1), labels_lr, negative_val) a_x5 = torch.where((a4 < a5) & (a4 != -1) & (a5 != -1), inv_rl, negative_val) + # a_x5 = torch.where((a4 < a5) & (a4 != -1) & (a5 != -1), labels_rl, negative_val) # # if both are positive with inverse 1:rel:2 & 2:rel:1 (this is correct) --> keep them the 'rel' label a_x6 = torch.where((labels_lr != labels_rl) & (labels_lr != ignore) & (labels_rl != ignore) & (inv_lr == labels_rl), labels_lr, negative_val) + # If we don't care LR, we don't need a_x6 + # a_x6 = torch.where((labels_lr != labels_rl) & (labels_lr != ignore) & + # (labels_rl != ignore) & (inv_even == odd_labels), even_labels, negative_val_long) # if one positive & one negative --> choose the positive class a_x2 = torch.where((labels_lr != labels_rl) & (labels_lr == ignore) & (labels_rl != ignore), inv_rl, negative_val) + # a_x2 = torch.where((labels_lr != labels_rl) & (labels_lr == ignore) & (labels_rl != ignore), + # labels_rl, negative_val) a_x3 = torch.where((labels_lr != labels_rl) & (labels_lr != ignore) & (labels_rl == ignore), labels_lr, negative_val) @@ -71,24 +142,149 @@ def __call__(self, *inputs): a8 = torch.where( (labels_lr != ignore) & (labels_rl != ignore) & (labels_lr != labels_rl) & (inv_lr != labels_rl), rl_probs, negative_val.float()) - + # a7 = torch.where((labels_lr != ignore) & (labels_rl != ignore) & (labels_lr != labels_rl), lr_probs, + # negative_val.float()) + # a8 = torch.where((labels_lr != ignore) & (labels_rl != ignore) & (labels_lr != labels_rl), rl_probs, + # negative_val.float()) a_x7 = torch.where((a7 >= a8) & (a7 != -1) & (a8 != -1), labels_lr, negative_val) a_x8 = torch.where((a7 < a8) & (a7 != -1) & (a8 != -1), inv_rl, negative_val) + # a_x8 = torch.where((a7 < a8) & (a7 != -1) & (a8 != -1), labels_rl, negative_val) fin = torch.stack([a_x1, a_x2, a_x3, a_x4, a_x5, a_x6, a_x7, a_x8]) + # fin = torch.stack([a_x1, a_x2, a_x3, a_x4, a_x5, a_x7, a_x8]) assert (torch.sum(torch.clamp(fin, min=-1.0, max=0.0), dim=0) == -7).all(), "check evaluation" + # assert (torch.sum(torch.clamp(fin, min=-1.0, max=0.0), dim=0) == -6).all(), "check evaluation" fin_preds = torch.max(fin, dim=0) + fin_truth = truth_lr + + return fin_preds[0], fin_truth, {'no_rel_matched_indices': no_rel_matched_indices, + 'no_rel_matched_types': no_rel_matched_types} - return fin_preds[0] +def calc_stats(preds, ts, params): + new_preds, new_ts, no_matched_rels = SelectClass(params)(preds[0], preds[1], ts[0], ts[1]) + tp_, fp_, fn_ = MeasureStatistics(params, 1.0)(new_preds, new_ts) + return new_preds, new_ts, no_matched_rels, tp_, fp_, fn_ -def calc_stats(preds, params): - new_preds = SelectClass(params)(preds[0], preds[1]) - return new_preds +def fbeta_score(precision, recall, beta=1.0): + beta_square = beta * beta + if (precision != 0.0) and (recall != 0.0): + res = ((1 + beta_square) * precision * recall / (beta_square * precision + recall)).astype(precision.dtype) + else: + 
res = 0.0 + return res + + +def estimate_perf(all_tp, all_fp, all_fn, params): + """ + Estimate performance: micro and macro average precision, recall, F1 score. + CPU - based + """ + lab_map = params['lab_map'] + class_size = params['voc_sizes']['rel_size'] + lab2ign = params['lab2ign_id'] + + all_tp = np.sum(all_tp, axis=0) + all_fp = np.sum(all_fp, axis=0) + all_fn = np.sum(all_fn, axis=0) + atp = np.sum(all_tp) + afp = np.sum(all_fp) + afn = np.sum(all_fn) + micro_p = (1.0 * atp) / (atp + afp) if (atp + afp != 0) else 0.0 + micro_r = (1.0 * atp) / (atp + afn) if (atp + afn != 0) else 0.0 + micro_f = fbeta_score(micro_p, micro_r) + + # macro (merge directions l2r+r2l) + ctp = [] + cfp = [] + cfn = [] + seen = [] + + # Without L R distinguishing + # for i in range(0, class_size): + # if i == lab2ign: # don't include other class + # continue + # elif (i in seen): + # continue + # else: + # ctp.append(all_tp[i]) + # cfp.append(all_fp[i]) + # cfn.append(all_fn[i]) + # seen.append(i) + + # With L R distinguishing + for i in range(0, class_size): + if i == lab2ign: # don't include other class + continue + elif (i in seen) or (lab_map[i] in seen): + continue + else: + ctp.append(all_tp[i] + all_tp[lab_map[i]]) + cfp.append(all_fp[i] + all_fp[lab_map[i]]) + cfn.append(all_fn[i] + all_fn[lab_map[i]]) + seen.append(i) + seen.append(lab_map[i]) + + pp = [] + rr = [] + ff = [] + for j in range(0, len(ctp)): + pp.append((1.0 * ctp[j]) / (ctp[j] + cfp[j]) if (ctp[j] + cfp[j]) != 0 else 0.0) + rr.append((1.0 * ctp[j]) / (ctp[j] + cfn[j]) if (ctp[j] + cfn[j]) != 0 else 0.0) + ff.append(fbeta_score(pp[j], rr[j])) + assert len(pp) == len(rr) == len(ff) + + # show performance on each class + if params['show_macro']: + gg = [ii for ii in range(0, class_size) if ii % 2 == 0][:-1] + lab_val = [] + for i in range(0, len(pp)): + lab_val.append([params['mappings']['rev_rel_map'][gg[i]].split(':')[1], pp[i], rr[i], ff[i]]) + print(tabulate(lab_val, headers=['Class', 'P', 'R', 'F1'], tablefmt='orgtbl')) + + macro_p = np.mean(pp) + macro_r = np.mean(rr) + macro_f = np.mean(ff) + return {'micro_p': micro_p, 'micro_r': micro_r, 'micro_f': micro_f, + 'macro_p': macro_p, 'macro_r': macro_r, 'macro_f': macro_f} + + +def get_entity_attrs(e_span_indice, words, offsets, sub_to_words): + e_words = [] + e_offset = [-1, -1] + curr_word_idx = -1 + for idx in range(e_span_indice[0], e_span_indice[1] + 1): + if sub_to_words[idx] != curr_word_idx: + e_words.append(words[sub_to_words[idx]]) + curr_word_idx = sub_to_words[idx] + if idx == e_span_indice[0]: + e_offset[0] = offsets[sub_to_words[idx]][0] + if idx == e_span_indice[1]: + e_offset[1] = offsets[sub_to_words[idx]][1] + return ' '.join(e_words), (e_offset[0], e_offset[1]) -def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): - # def gen_annotation(fidss, ent_anns, rel_anns, params, result_dir): + +def estimate_rel(ref_dir, result_dir, fids, ent_anns, rel_anns, params): + """Evaluate entity and relation performance using n2c2 script""" + + # generate brat prediction + gen_annotation(fids, ent_anns, rel_anns, params, result_dir) + + # calculate scores + pred_dir = ''.join([result_dir, 'rel-last/rel-ann/']) + pred_scores_file = ''.join([result_dir, 'rel-last/rel-scores-', params['ner_eval_corpus'], '.txt']) + + # run evaluation, output in the score file + eval_performance(ref_dir, pred_dir, result_dir, pred_scores_file, params) + + # extract scores + scores = extract_fscore(pred_scores_file) + + return scores + + +def gen_annotation(fidss, ent_anns, 
rel_anns, params, result_dir): """Generate entity and relation prediction""" dir2wr = ''.join([result_dir, 'rel-last/rel-ann/']) @@ -99,7 +295,6 @@ def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): # Initial ent+rel map map = defaultdict() - for fids in fidss: for fid in fids: map[fid] = {'ents': {}, 'rels': {}} @@ -116,11 +311,14 @@ def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): sub_to_words = ent_ann['sub_to_words'][xb] entities = map[fid]['ents'] + # e_count = len(entities) + 1 for x, pair in enumerate(span_indices): if pair[0].item() == -1: break if ner_preds[x] > 0: + # e_id = 'T' + str(e_count) + # e_count += 1 try: e_id = ner_terms.id2term[x] e_type = params['mappings']['rev_type_map'][ @@ -133,12 +331,10 @@ def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): e_words, e_offset = get_entity_attrs(pair, words, offsets, sub_to_words) else: e_words, e_offset = get_entity_attrs(pair, words, offsets, sub_to_words) - - # save entity map + # entity_map[(xb, (pair[0].item(), pair[1].item()))] = ( + # ner_preds[x], e_id, e_type, e_words, e_offset) entity_map[(xb, x)] = ( ner_preds[x], e_id, e_type, e_words, e_offset) - - # save entity dic info entities[e_id] = {"id": e_id, "type": e_type, "start": e_offset[0], "end": e_offset[1], "ref": e_words} except KeyError as error: @@ -147,7 +343,13 @@ def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): # Mapping relations pairs_idx = rel_ann['pairs_idx'] rel_preds = rel_ann['rel_preds'] + # positive_indices = rel_ann['positive_indices'] + # if positive_indices: + # pairs_idx_i = pairs_idx[0][positive_indices] + # pairs_idx_j = pairs_idx[1][positive_indices] + # pairs_idx_k = pairs_idx[2][positive_indices] + # else: pairs_idx_i = pairs_idx[0] pairs_idx_j = pairs_idx[1] pairs_idx_k = pairs_idx[2] @@ -160,8 +362,12 @@ def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): k = pairs_idx_k[x] rel = rel_preds[x].item() role = params['mappings']['rev_rel_map'][rel].split(":")[1] + # role = params['mappings']['rev_rtype_map'][rel] if role != 'Other': - + # arg1s = entity_map[ + # (i.item(), (ent_ann['span_indices'][i][j][0].item(), ent_ann['span_indices'][i][j][1].item()))] + # arg2s = entity_map[ + # (i.item(), (ent_ann['span_indices'][i][k][0].item(), ent_ann['span_indices'][i][k][1].item()))] try: arg1s = entity_map[(i.item(), j.item())] arg2s = entity_map[(i.item(), k.item())] @@ -181,78 +387,37 @@ def write_entity_relations(result_dir, fidss, ent_anns, rel_anns, params): except KeyError as error: print('error relation', fids[i], error) - for fid, ners_rels in map.items(): - write_annotation_file(dir2wr, fid, entities=ners_rels['ents'], - relations=ners_rels['rels']) - - -def get_entity_attrs(e_span_indice, words, offsets, sub_to_words): - e_words = [] - e_offset = [-1, -1] - curr_word_idx = -1 - for idx in range(e_span_indice[0], e_span_indice[1] + 1): - if sub_to_words[idx] != curr_word_idx: - e_words.append(words[sub_to_words[idx]]) - curr_word_idx = sub_to_words[idx] - if idx == e_span_indice[0]: - e_offset[0] = offsets[sub_to_words[idx]][0] - if idx == e_span_indice[1]: - e_offset[1] = offsets[sub_to_words[idx]][1] - return ' '.join(e_words), (e_offset[0], e_offset[1]) - - -def mapping_entity_id(entities_): - eid = 1 - enid_mapping = collections.OrderedDict() - en_preds_out_ = [] - - # create mapping for entity id first - for en_id, en_data in entities_.items(): - - if en_id.startswith('TR'): - continue - - elif en_id.startswith('T'): 
- enid_mapping[en_id] = 'T' + str(eid) - eid += 1 - en_preds_out_.append(en_data) - - # creat mapping for trigger id - for en_id, en_data in entities_.items(): - - if en_id.startswith('TR'): - enid_mapping[en_id] = 'T' + str(eid) - eid += 1 - en_preds_out_.append(en_data) - - return enid_mapping, en_preds_out_ + # r_id = 'R' + str(r_count) + # r_count += 1 + # relations[r_id] = {"id": r_id, "role": role, + # "left_arg": {"label": "Arg1", "id": arg2}, + # "right_arg": {"label": "Arg2", "id": arg1}} + for fid, ners_rels in map.items(): + write_annotation_file(ann_file=dir2wr + fid + '.ann', entities=ners_rels['ents'], relations=ners_rels['rels']) -def write_annotation_file(dir2wr, fid, entities=None, relations=None): - re_lines = [] - en_lines = [] - tr_lines = [] - # entity id mapping - enid_mapping, en_preds_out_ = mapping_entity_id(entities) +def write_annotation_file( + ann_file, entities=None, triggers=None, relations=None, events=None +): + lines = [] - if entities: - for entity in en_preds_out_: + def annotate_text_bound(entities): + for entity in entities.values(): entity_annotation = "{}\t{} {} {}\t{}".format( - enid_mapping[entity["id"]], + entity["id"], entity["type"], entity["start"], entity["end"], entity["ref"], ) + lines.append(entity_annotation) - re_lines.append(entity_annotation) - - if entity["id"].startswith('TR'): - tr_lines.append(entity_annotation) + if entities: + annotate_text_bound(entities) - elif entity["id"].startswith('T'): - en_lines.append(entity_annotation) + if triggers: + annotate_text_bound(triggers) if relations: for relation in relations.values(): @@ -260,17 +425,86 @@ def write_annotation_file(dir2wr, fid, entities=None, relations=None): relation["id"], relation["role"], relation["left_arg"]["label"], - enid_mapping[relation["left_arg"]["id"]], + relation["left_arg"]["id"], relation["right_arg"]["label"], - enid_mapping[relation["right_arg"]["id"]], + relation["right_arg"]["id"], ) - re_lines.append(relation_annotation) + lines.append(relation_annotation) - # write to file - re_file = ''.join([dir2wr, fid, '-RE.ann']) - en_file = ''.join([dir2wr, fid, '-EN.ann']) - tr_file = ''.join([dir2wr, fid, '-TR.ann']) - - write_lines(re_lines, re_file) - write_lines(en_lines, en_file) - write_lines(tr_lines, tr_file) + if events: + for event in events.values(): + event_annotation = "{}\t{}:{}".format( + event["id"], event["trigger_type"], event["trigger_id"] + ) + for arg in event["args"]: + event_annotation += " {}:{}".format(arg["role"], arg["id"]) + lines.append(event_annotation) + + write_lines(lines, ann_file) + + +def eval_performance(ref_dir, pred_dir, result_dir, pred_scores_file, params): + # run evaluation script + + command = ''.join( + ["python ", params['rel_eval_script_path'], " --ner-eval-corpus ", params['ner_eval_corpus'], " ", ref_dir, " ", + pred_dir, " > ", pred_scores_file]) + os.system(command) + + # if predict: run for all config + if params['predict'] == True: + # entiy scores only + ner_eval_corpus = ''.join([params['task_name'], '_en']) + pred_scores_file = ''.join([result_dir, 'rel-last/rel-scores-', ner_eval_corpus, '.txt']) + command = ''.join( + ["python ", params['rel_eval_script_path'], " --ner-eval-corpus ", ner_eval_corpus, " ", ref_dir, + " ", + pred_dir, " > ", pred_scores_file]) + os.system(command) + + # trigger scores only + ner_eval_corpus = ''.join([params['task_name'], '_tr']) + pred_scores_file = ''.join([result_dir, 'rel-last/rel-scores-', ner_eval_corpus, '.txt']) + command = ''.join( + ["python ", 
params['rel_eval_script_path'], " --ner-eval-corpus ", ner_eval_corpus, " ", ref_dir, + " ", + pred_dir, " > ", pred_scores_file]) + os.system(command) + + +def extract_fscore(path): + file = open(path, 'r') + lines = file.readlines() + report = defaultdict() + report['NER'] = defaultdict() + report['REL'] = defaultdict() + + ent_or_rel = '' + for line in lines: + if '*' in line and 'TRACK' in line: + ent_or_rel = 'NER' + elif '*' in line and 'RELATIONS' in line: + ent_or_rel = 'REL' + elif len(line.split()) > 0 and line.split()[0] == 'Overall': + tokens = line.split() + if len(tokens) > 8: + strt_f, strt_r, strt_p, soft_f, soft_r, soft_p \ + = tokens[-7], tokens[-8], tokens[-9], tokens[-4], tokens[-5], tokens[-6] + else: + strt_f, strt_r, strt_p, soft_f, soft_r, soft_p \ + = tokens[-4], tokens[-5], tokens[-6], tokens[-1], tokens[-2], tokens[-3] + if line.split()[1] == '(micro)': + mi_or_mc = 'micro' + elif line.split()[1] == '(macro)': + mi_or_mc = 'macro' + else: + mi_or_mc = '' + if mi_or_mc != '': + report[ent_or_rel][mi_or_mc] = {'st_f': float(strt_f.strip()) * 100, + 'st_r': float(strt_r.strip()) * 100, + 'st_p': float(strt_p.strip()) * 100, + 'so_f': float(soft_f.strip()) * 100, + 'so_r': float(soft_r.strip()) * 100, + 'so_p': float(soft_p.strip()) * 100} + + return report diff --git a/eval/evaluate.py b/eval/evaluate.py index 716006b..7033e98 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -1,13 +1,19 @@ +import time + import torch from tqdm import tqdm -from eval.evalRE import write_entity_relations -from eval.evalEV import write_events +from eval.evalEV import evaluate_ev +from eval.evalRE import estimate_perf, estimate_rel +from eval.evalNER import eval_nner +from scripts.pipeline_process import gen_ner_ann_files, gen_rel_ann_files from utils import utils +from utils.utils import _humanized_time -def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params): +def eval(model, eval_dir, result_dir, eval_dataloader, eval_data, params, epoch=0): mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping'] + rel_tp_tr, rel_fp_tr, rel_fn_tr = [], [], [] # store predicted entities ent_preds = [] @@ -17,15 +23,20 @@ def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params fidss, wordss, offsetss, sub_to_wordss, span_indicess = [], [], [], [], [] - # entity and relation output - ent_anns = [] rel_anns = [] + ent_anns = [] # Evaluation phase model.eval() + # nner all_ner_preds, all_ner_golds, all_ner_terms = [], [], [] + total_rel_matched_indices = 0 + total_rel_matched_types = 0 + t_start = time.time() + + is_eval_rel = False is_eval_ev = False for step, batch in enumerate( @@ -34,8 +45,8 @@ def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params eval_data_ids = batch tensors = utils.get_tensors(eval_data_ids, eval_data, params) - nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, _, \ - etypes, _ = tensors + nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, nn_gtruth, nn_l2r, _, \ + nn_truth_ev, nn_ev_idxs, ev_lbls, etypes, _ = tensors fids = [ eval_data["fids"][data_id] for data_id in eval_data_ids[0].tolist() @@ -61,11 +72,34 @@ def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params ] with torch.no_grad(): - ner_out, rel_out, ev_out = model(tensors, params) + if not params['predict']: + 
ner_out, rel_out, ev_out, loss = model(tensors, epoch) + else: + ner_out, rel_out, ev_out, loss = model(tensors) ner_preds = ner_out['preds'] - ner_terms = ner_out['terms'] + if not params['predict']: # Debug only + # Case train REL only + if params['skip_ner'] and params['rel_epoch'] >= (params['epoch'] - 1) and params['use_gold_ner']: + ner_terms = ner_out['gold_terms'] + ner_preds = ner_out['golds'] + # Case train EV only + elif params['skip_ner'] and params['skip_rel'] and params['use_gold_ner'] \ + and params['use_gold_rel']: + ner_terms = ner_out['gold_terms'] + ner_preds = ner_out['golds'] + else: + ner_terms = ner_out['terms'] + else: + if params['gold_eval'] or params['pipelines']: + if params['pipelines'] and params['pipe_flag'] == 0: + ner_terms = ner_out['terms'] + else: + ner_terms = ner_out['gold_terms'] + ner_preds = ner_out['golds'] + else: + ner_terms = ner_out['terms'] all_ner_terms.append(ner_terms) @@ -102,35 +136,87 @@ def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params ) all_ner_preds.append(pred_entities) - # entity prediction - ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['preds'], 'words': words, - 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, - 'ner_terms': ner_terms} - ent_anns.append(ent_ann) - fidss.append(fids) + if params['predict']: + if params['gold_eval'] or params['pipelines']: + if params['pipelines'] and params['pipe_flag'] == 0: + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['preds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + else: + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['golds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + else: + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['preds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + else: + # Case only train REL + if params['skip_ner'] and params['rel_epoch'] >= (params['epoch'] - 1) and params['use_gold_ner']: + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['golds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + # Case only train EV + elif params['skip_ner'] and params['skip_rel'] and params['use_gold_rel']: + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['golds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + else: + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['preds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + + ent_anns.append(ent_ann) wordss.append(words) offsetss.append(offsets) sub_to_wordss.append(sub_to_words) - # relation prediction if rel_out != None: - pairs_idx = rel_out['pairs_idx'] - rel_pred = rel_out['preds'] + rel_tp_tr.append(rel_out['true_pos'].tolist()) + rel_fp_tr.append(rel_out['false_pos'].tolist()) + rel_fn_tr.append(rel_out['false_neg'].tolist()) + total_rel_matched_indices += rel_out['no_matched_rel']['no_rel_matched_indices'] + total_rel_matched_types += rel_out['no_matched_rel']['no_rel_matched_types'] + + if params['predict']: + if params['gold_eval'] or params['pipelines']: + if params['pipelines'] and params['pipe_flag'] != 2: + pairs_idx = 
rel_out['pairs_idx'] + rel_pred = rel_out['preds'] + else: + pairs_idx = rel_out['l2r'] + rel_pred = rel_out['truth'] + else: + pairs_idx = rel_out['pairs_idx'] + rel_pred = rel_out['preds'] + else: + # Case only train REL + if params['skip_ner'] and params['rel_epoch'] >= (params['epoch'] - 1) \ + and params['use_gold_ner']: + pairs_idx = rel_out['l2r'] + rel_pred = rel_out['preds'] + # Case only train EV + elif params['skip_ner'] and params['skip_rel'] and params['use_gold_rel']: + pairs_idx = rel_out['l2r'] + rel_pred = rel_out['truth'] + else: + pairs_idx = rel_out['pairs_idx'] + rel_pred = rel_out['preds'] rel_ann = {'pairs_idx': pairs_idx, 'rel_preds': rel_pred} rel_anns.append(rel_ann) + is_eval_rel = True else: rel_anns.append({}) - # event prediction if ev_out != None: # add predicted entity ent_preds.append(ner_out["nner_preds"]) # add predicted events - ev_preds.append(ev_out) + ev_preds.append(ev_out['output']) span_indicess.append( [ @@ -148,24 +234,273 @@ def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params # Clear GPU unused RAM: if params['gpu'] >= 0: torch.cuda.empty_cache() - # write entity and relation prediction - _ = write_entity_relations( - result_dir=result_dir, - fidss=fidss, - ent_anns=ent_anns, - rel_anns=rel_anns, - params=params - ) + if params['predict'] and params['pipelines']: + if params['pipe_flag'] == 0: + gen_ner_ann_files(fidss, ent_anns, params) + return + elif params['pipe_flag'] == 1: + gen_rel_ann_files(fidss, ent_anns, rel_anns, params) + return + + # Do estimations here + labels = params["mappings"]["nn_mapping"]["trigger_labels"] + if params["ner_predict_all"]: + labels = params["mappings"]["nn_mapping"]["full_labels"] + + ner_res, ner_score = eval_nner(all_ner_preds, all_ner_golds, labels) + scores = estimate_rel(ref_dir=eval_dir, + result_dir=result_dir, + fids=fidss, + ent_anns=ent_anns, + rel_anns=rel_anns, + params=params) + if is_eval_rel: + tr_scores = estimate_perf(rel_tp_tr, rel_fp_tr, rel_fn_tr, params) + else: + tr_scores = {'micro_p': 0, 'micro_r': 0, 'micro_f': 0} if is_eval_ev > 0: - write_events(fids=fidss, - all_ent_preds=ent_preds, - all_words=wordss, - all_offsets=offsetss, - all_span_terms=all_ner_terms, - all_span_indices=span_indicess, - all_sub_to_words=sub_to_wordss, - all_ev_preds=ev_preds, - g_entity_ids_=g_entity_ids_, - params=params, - result_dir=result_dir) + ev_scores = evaluate_ev(fids=fidss, + all_ent_preds=ent_preds, + all_words=wordss, + all_offsets=offsetss, + all_span_terms=all_ner_terms, + all_span_indices=span_indicess, + all_sub_to_words=sub_to_wordss, + all_ev_preds=ev_preds, + params=params, + gold_dir=eval_dir, + result_dir=result_dir) + else: + ev_scores = {} + + # Print estimation scores here + if not params['predict'] or (params['predict'] and not params['gold_eval']): + print() + print('-----OUR EVALUATIONS (NOT RECOMMEND)-----') + print() + print(ner_res) + print() + print( + "ENT: P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} ".format(ner_score[-1][1], ner_score[-1][2], + ner_score[-1][3]), end="", + ) + print() + print( + "REL: P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} ".format( + tr_scores["micro_p"] * 100, + tr_scores["micro_r"] * 100, + tr_scores["micro_f"] * 100, + ), + end="", + ) + print() + print('Total matched indice relations', total_rel_matched_indices) + print('Total matched type relations', total_rel_matched_types) + + print() + print('-----EVALUATING BY N2C2 SCRIPT (FOR ENT & REL)-----') + print() + print('STRICT_MATCHING:') + print_scores('NER', scores['NER'], 'st') + 
print() + print('SOFT_MATCHING:') + print_scores('NER', scores['NER'], 'so') + if is_eval_rel: + print() + print('STRICT_MATCHING:') + print_scores('REL', scores['REL'], 'st') + print() + print('SOFT_MATCHING:') + print_scores('REL', scores['REL'], 'so') + else: + if params['skip_rel']: + print('Not evaluate REL') + else: + print('No relation') + print() + print('-----EVALUATING BY SCRIPT (FOR EV)-----') + print() + if len(ev_scores) > 0: + sub_p, sub_r, sub_f = ev_scores['sub_scores'][0], ev_scores['sub_scores'][1], ev_scores['sub_scores'][2] + mod_p, mod_r, mod_f = ev_scores['mod_scores'][0], ev_scores['mod_scores'][1], ev_scores['mod_scores'][2] + tot_p, tot_r, tot_f = ev_scores['tot_scores'][0], ev_scores['tot_scores'][1], ev_scores['tot_scores'][2] + print('SUB : P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} '.format(sub_p, sub_r, sub_f), end="") + print() + print('MOD : P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} '.format(mod_p, mod_r, mod_f), end="") + print() + print('TOT : P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} '.format(tot_p, tot_r, tot_f), end="") + print() + else: + print('No event/Not evaluate EV/error when evaluating by CG script') + print() + print() + print() + t_end = time.time() + print('Elapsed time: {}'.format(_humanized_time(t_end - t_start))) + print() + + # Do saving models + if not params['predict']: + # ! ========== KHOA WAS HERE ========== + ner_f1score = ner_score[-1][3] + + # ner_fscore = ner_f1score * 100 + # get the best score by n2c2 instead + ner_fscore = scores['NER']['micro']['st_f'] + + if is_eval_rel: + rel_fscore = scores['REL']['micro']['st_f'] + else: + rel_fscore = 0 + + if len(ev_scores) > 0: + ev_fscore = ev_scores['tot_scores'][2] + else: + ev_fscore = 0 + + if params['ner_epoch'] >= (params['epoch'] - 1): + best_score = ner_fscore + elif params['rel_epoch'] >= (params['epoch'] - 1): + best_score = rel_fscore + else: + best_score = ev_fscore + # Save models: + if params['save_ner']: + ner_model_path = params['ner_model_dir'] + utils.handle_checkpoints( + model=model.NER_layer, + checkpoint_dir=ner_model_path, + params={ + "filename": "ner_base", + "epoch": epoch, + "fscore": ner_fscore, + "ner_fscore": ner_fscore, + "rel_fscore": rel_fscore, + "ev_fscore": ev_fscore, + 'device': params['device'] + }, + filter_func=utils.save_best_fscore, + num_saved=1 + ) + + if params['save_rel']: + rel_model_path = params['rel_model_dir'] + utils.handle_checkpoints( + model=model.REL_layer, + checkpoint_dir=rel_model_path, + params={ + "filename": "rel_base", + "epoch": epoch, + "fscore": rel_fscore, + "ner_fscore": ner_fscore, + "rel_fscore": rel_fscore, + "ev_fscore": ev_fscore, + 'device': params['device'] + }, + filter_func=utils.save_best_fscore, + num_saved=1 + ) + if params['save_model_pipeline']: + ner_model_path = params['ner_model_dir'] + utils.handle_checkpoints( + model=model.NER_layer, + checkpoint_dir=ner_model_path, + params={ + "filename": "rel_base", + "epoch": epoch, + "fscore": rel_fscore, + "ner_fscore": ner_fscore, + "rel_fscore": rel_fscore, + "ev_fscore": ev_fscore, + 'device': params['device'] + }, + filter_func=utils.save_best_fscore, + num_saved=1 + ) + if params['save_ev']: + ev_model_path = params['ev_model_dir'] + utils.handle_checkpoints( + model=model.EV_layer, + checkpoint_dir=ev_model_path, + params={ + "filename": "ev_base", + "epoch": epoch, + "fscore": ev_fscore, + "ner_fscore": ner_fscore, + "rel_fscore": rel_fscore, + "ev_fscore": ev_fscore, + 'device': params['device'] + }, + filter_func=utils.save_best_fscore, + num_saved=1 + ) + if 
params['save_model_pipeline']:
+                ner_model_path = params['ner_model_dir']
+                rel_model_path = params['rel_model_dir']
+                utils.handle_checkpoints(
+                    model=model.NER_layer,
+                    checkpoint_dir=ner_model_path,
+                    params={
+                        "filename": "ev_base",
+                        "epoch": epoch,
+                        "fscore": ev_fscore,
+                        "ner_fscore": ner_fscore,
+                        "rel_fscore": rel_fscore,
+                        "ev_fscore": ev_fscore,
+                        'device': params['device']
+                    },
+                    filter_func=utils.save_best_fscore,
+                    num_saved=1
+                )
+                utils.handle_checkpoints(
+                    model=model.REL_layer,
+                    checkpoint_dir=rel_model_path,
+                    params={
+                        "filename": "ev_base",
+                        "epoch": epoch,
+                        "fscore": ev_fscore,
+                        "ner_fscore": ner_fscore,
+                        "rel_fscore": rel_fscore,
+                        "ev_fscore": ev_fscore,
+                        'device': params['device']
+                    },
+                    filter_func=utils.save_best_fscore,
+                    num_saved=1
+                )
+
+        if params['save_all_models']:
+            deepee_model_path = params['joint_model_dir']
+            utils.handle_checkpoints(
+                model=model,
+                checkpoint_dir=deepee_model_path,
+                params={
+                    "filename": "deepee_base",
+                    "epoch": epoch,
+                    "fscore": best_score,
+                    "ner_fscore": ner_fscore,
+                    "rel_fscore": rel_fscore,
+                    "ev_fscore": ev_fscore,
+                    'device': params['device']
+                },
+                filter_func=utils.save_best_fscore,
+                num_saved=1
+            )
+            print("Saved all models")
+    # ! ===================================
+
+    if len(ev_scores) > 0:
+        return ner_score, is_eval_rel, tr_scores, scores, {'p': ev_scores['tot_scores'][0],
+                                                           'r': ev_scores['tot_scores'][1],
+                                                           'f': ev_scores['tot_scores'][2]}
+    else:
+        return ner_score, is_eval_rel, tr_scores, scores, ev_scores
+
+
+def print_scores(k, v, stoso):
+    print(
+        k + "(MICRO): P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} , (MACRO): P/R/F1 = {:.02f}\t{:.02f}\t{:.02f} ".format(
+            v['micro'][stoso + '_p'], v['micro'][stoso + '_r'], v['micro'][stoso + '_f'],
+            v['macro'][stoso + '_p'], v['macro'][stoso + '_r'], v['macro'][stoso + '_f']), end="",
+    )
+    print()
diff --git a/eval/scripts/n2c2.py b/eval/scripts/n2c2.py
new file mode 100644
index 0000000..3865281
--- /dev/null
+++ b/eval/scripts/n2c2.py
@@ -0,0 +1,1511 @@
+#!/usr/local/bin/python
+
+"""Inter-annotator agreement calculator."""
+
+"""
+To run this file, please use:
+
+python <gold standard folder> <system output folder>
+
+e.g.: python gold_annotations system_annotations
+
+Please note that you must use Python 3 to get the correct results with this script.
+
+"""
+
+import argparse
+import glob
+import os
+import logging
+from collections import defaultdict
+from xml.etree import cElementTree
+
+logger = logging.getLogger(__name__)
+
+
+class ClinicalCriteria(object):
+    """Criteria in the Track 1 documents."""
+
+    def __init__(self, tid, value):
+        """Init."""
+        self.tid = tid.strip().upper()
+        self.ttype = self.tid
+        self.value = value.lower().strip()
+
+    def equals(self, other, mode='strict'):
+        """Return whether the current criteria is equal to the one provided."""
+        if other.tid == self.tid and other.value == self.value:
+            return True
+        return False
+
+
+class ClinicalConcept(object):
+    """Named Entity Tag class."""
+
+    def __init__(self, tid, start, end, ttype, text=''):
+        """Init."""
+        self.tid = str(tid).strip()
+        self.start = int(start)
+        self.end = int(end)
+        self.text = str(text).strip()
+        self.ttype = str(ttype).strip()
+
+    def span_matches(self, other, mode='strict'):
+        """Return whether the current tag overlaps with the one provided."""
+        assert mode in ('strict', 'lenient')
+        if mode == 'strict':
+            if self.start == other.start and self.end == other.end:
+                return True
+        else:  # lenient
+            if (self.end > other.start and self.start < other.end) or \
+                    (self.start < other.end and other.start < self.end):
+                return True
+        return False
+
+    def equals(self, other, mode='strict'):
+        """Return whether the current tag is equal to the one provided."""
+        assert mode in ('strict', 'lenient')
+        return other.ttype == self.ttype and self.span_matches(other, mode)
+
+    def __str__(self):
+        """String representation."""
+        return '{}\t{}\t({}:{})'.format(self.ttype, self.text, self.start, self.end)
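+
+# Illustrative sketch only (the ids, offsets, and texts below are invented,
+# not taken from any corpus): strict matching requires identical span
+# offsets, while lenient matching only requires the two spans to overlap.
+#
+#   gold = ClinicalConcept('T1', 10, 24, 'Cancer', 'breast carcinoma')
+#   pred = ClinicalConcept('T2', 17, 24, 'Cancer', 'carcinoma')
+#   gold.span_matches(pred, mode='strict')   # False: offsets differ
+#   gold.span_matches(pred, mode='lenient')  # True: the spans overlap
+#   gold.equals(pred, mode='lenient')        # True: same ttype + overlapping span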
+
+
+class Relation(object):
+    """Relation class."""
+
+    def __init__(self, rid, arg1, arg2, rtype):
+        """Init."""
+        assert isinstance(arg1, ClinicalConcept)
+        assert isinstance(arg2, ClinicalConcept)
+        self.rid = str(rid).strip()
+        self.arg1 = arg1
+        self.arg2 = arg2
+        self.rtype = str(rtype).strip()
+
+    def equals(self, other, mode='strict'):
+        """Return whether the current relation is equal to the one provided."""
+        assert mode in ('strict', 'lenient')
+        if self.arg1.equals(other.arg1, mode) and \
+                self.arg2.equals(other.arg2, mode) and \
+                self.rtype == other.rtype:
+            return True
+        return False
+
+    def __str__(self):
+        """String representation."""
+        return '{} ({}->{})'.format(self.rtype, self.arg1.ttype,
+                                    self.arg2.ttype)
+
+
+class RecordTrack1(object):
+    """Record for Track 1 class."""
+
+    def __init__(self, file_path):
+        self.path = os.path.abspath(file_path)
+        self.basename = os.path.basename(self.path)
+        self.annotations = self._get_annotations()
+        self.text = None
+
+    @property
+    def tags(self):
+        return self.annotations['tags']
+
+    def _get_annotations(self):
+        """Return a dictionary with all the annotations in the .ann file."""
+        annotations = defaultdict(dict)
+        annotation_file = cElementTree.parse(self.path)
+        for tag in annotation_file.findall('.//TAGS/*'):
+            criterion = ClinicalCriteria(tag.tag.upper(), tag.attrib['met'])
+            annotations['tags'][tag.tag.upper()] = criterion
+            if tag.attrib['met'] not in ('met', 'not met'):
+                # a bare assert on a non-empty string never fails; raise instead
+                raise AssertionError('{}: Unexpected value ("{}") for the {} tag!'.format(
+                    self.path, criterion.value, criterion.ttype))
+        return annotations
+
+
+class RecordTrack2(object):
+    """Record for Track 2 class."""
+
+    def __init__(self, file_path):
+        """Initialize."""
+        self.path = os.path.abspath(file_path)
+        self.basename = os.path.basename(self.path)
+        self.annotations = self._get_annotations()
+        # self.text = self._get_text()
+
+    @property
+    def tags(self):
+        return self.annotations['tags']
+
+    @property
+    def relations(self):
+        return self.annotations['relations']
+
+    def _get_annotations(self):
+        """Return a dictionary with all the annotations in the .ann file."""
+        annotations = defaultdict(dict)
+        with open(self.path) as annotation_file:
+            lines = annotation_file.readlines()
+            for line_num, line in enumerate(lines):
+                if line.strip().startswith('T'):
+                    try:
+                        tag_id, tag_m, tag_text = line.strip().split('\t')
+                    except ValueError:
+                        print(self.path, line)
+                        continue  # skip malformed tag lines instead of crashing below
+                    if len(tag_m.split(' ')) == 3:
+                        tag_type, tag_start, tag_end = tag_m.split(' ')
+                    elif len(tag_m.split(' ')) == 4:
+                        tag_type, tag_start, _, tag_end = tag_m.split(' ')
+                    elif len(tag_m.split(' ')) == 5:
+                        tag_type, tag_start, _, _, tag_end = tag_m.split(' ')
+                    else:
+                        print(self.path)
+                        print(line)
+                        continue  # skip unexpected span layouts instead of crashing below
+                    tag_start, tag_end = int(tag_start), int(tag_end)
+                    annotations['tags'][tag_id] = ClinicalConcept(tag_id,
+                                                                  tag_start,
+                                                                  tag_end,
+                                                                  tag_type,
+                                                                  tag_text)
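+            # For reference, brat-style standoff lines look roughly like the
+            # following (made-up example; tabs separate the fields of a tag line):
+            #   T1<TAB>Cancer 10 24<TAB>breast carcinoma
+            #   R1<TAB>Theme Arg1:T1 Arg2:T2
+            # The second pass below resolves R lines against the tags parsed
+            # in the first pass above.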
+            for line_num, line in enumerate(lines):
+                if line.strip().startswith('R'):
+                    rel_id, rel_m = line.strip().split('\t')
+                    rel_type, rel_arg1, rel_arg2 = rel_m.split(' ')
+                    rel_arg1 = rel_arg1.split(':')[1]
+                    rel_arg2 = rel_arg2.split(':')[1]
+                    try:
+                        arg1 = annotations['tags'][rel_arg1]
+                        arg2 = annotations['tags'][rel_arg2]
+                        annotations['relations'][rel_id] = Relation(rel_id, arg1,
+                                                                    arg2, rel_type)
+                    except KeyError as err:
+                        logger.info(err)
+        return annotations
+
+    def _get_text(self):
+        """Return the text in the corresponding txt file."""
+        path = self.path.replace('.ann', '.txt')
+        with open(path) as text_file:
+            text = text_file.read()
+        return text
+
+    def search_by_id(self, key):
+        """Search by id among both tags and relations."""
+        try:
+            return self.annotations['tags'][key]
+        except KeyError:
+            try:
+                return self.annotations['relations'][key]
+            except KeyError:
+                return None
+
+
+class Measures(object):
+    """Abstract methods and var to evaluate."""
+
+    def __init__(self, tp=0, tn=0, fp=0, fn=0):
+        """Initialize."""
+        assert type(tp) == int
+        assert type(tn) == int
+        assert type(fp) == int
+        assert type(fn) == int
+        self.tp = tp
+        self.tn = tn
+        self.fp = fp
+        self.fn = fn
+
+    def precision(self):
+        """Compute Precision score."""
+        try:
+            return self.tp / (self.tp + self.fp)
+        except ZeroDivisionError:
+            return 0.0
+
+    def recall(self):
+        """Compute Recall score."""
+        try:
+            return self.tp / (self.tp + self.fn)
+        except ZeroDivisionError:
+            return 0.0
+
+    def f_score(self, beta=1):
+        """Compute F-beta measure score."""
+        assert beta > 0.
+        try:
+            num = (1 + beta ** 2) * (self.precision() * self.recall())
+            den = beta ** 2 * (self.precision() + self.recall())
+            return num / den
+        except ZeroDivisionError:
+            return 0.0
+
+    def f1(self):
+        """Compute the F1-score (beta=1)."""
+        return self.f_score(beta=1)
+
+    def specificity(self):
+        """Compute Specificity score."""
+        try:
+            return self.tn / (self.fp + self.tn)
+        except ZeroDivisionError:
+            return 0.0
+
+    def sensitivity(self):
+        """Compute Sensitivity score."""
+        return self.recall()
+
+    def auc(self):
+        """Compute AUC score."""
+        return (self.sensitivity() + self.specificity()) / 2
+
+
+class SingleEvaluator(object):
+    """Evaluate two single files."""
+
+    def __init__(self, doc1, doc2, track, mode='strict', key=None, verbose=False, exclude_tags=()):
+        """Initialize."""
+        assert isinstance(doc1, RecordTrack2) or isinstance(doc1, RecordTrack1)
+        assert isinstance(doc2, RecordTrack2) or isinstance(doc2, RecordTrack1)
+        assert mode in ('strict', 'lenient')
+        assert doc1.basename == doc2.basename
+        self.scores = {'tags': {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0},
+                       'relations': {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}}
+        self.doc1 = doc1
+        self.doc2 = doc2
+        if key:
+            gol = [t for t in doc1.tags.values() if t.ttype == key and t.ttype not in exclude_tags]
+            sys = [t for t in doc2.tags.values() if t.ttype == key and t.ttype not in exclude_tags]
+            sys_check = [t for t in doc2.tags.values() if t.ttype == key and t.ttype not in exclude_tags]
+        else:
+            gol = [t for t in doc1.tags.values() if t.ttype not in exclude_tags]
+            sys = [t for t in doc2.tags.values() if t.ttype not in exclude_tags]
+            sys_check = [t for t in doc2.tags.values() if t.ttype not in exclude_tags]
+
+        # pare down matches -- if multiple system tags overlap with only one
+        # gold standard tag, only keep one sys tag
+        gol_matched = []
+        for s in sys:
+            for g in gol:
+                if g.equals(s, mode):
+                    if g not in gol_matched:
+                        gol_matched.append(g)
+                    else:
+                        if s in sys_check:
+                            sys_check.remove(s)
+
+        sys = sys_check
+        # now evaluate
+        self.scores['tags']['tp'] = len({s.tid for s in sys for g in gol if g.equals(s, mode)})
+        self.scores['tags']['fp'] = len({s.tid for s in sys}) - self.scores['tags']['tp']
+        self.scores['tags']['fn'] = 
len({g.tid for g in gol}) - self.scores['tags']['tp'] + self.scores['tags']['tn'] = 0 + + if verbose and track == 2: + tps = {s for s in sys for g in gol if g.equals(s, mode)} + fps = set(sys) - tps + fns = set() + for g in gol: + if not len([s for s in sys if s.equals(g, mode)]): + fns.add(g) + for e in fps: + print('FP: ' + str(e)) + for e in fns: + print('FN:' + str(e)) + if track == 2: + if key: + gol = [r for r in doc1.relations.values() if r.rtype == key] + sys = [r for r in doc2.relations.values() if r.rtype == key] + sys_check = [r for r in doc2.relations.values() if r.rtype == key] + else: + gol = [r for r in doc1.relations.values()] + sys = [r for r in doc2.relations.values()] + sys_check = [r for r in doc2.relations.values()] + + # pare down matches -- if multiple system tags overlap with only one + # gold standard tag, only keep one sys tag + gol_matched = [] + for s in sys: + for g in gol: + if (g.equals(s, mode)): + if g not in gol_matched: + gol_matched.append(g) + else: + if s in sys_check: + sys_check.remove(s) + sys = sys_check + # now evaluate + self.scores['relations']['tp'] = len({s.rid for s in sys for g in gol if g.equals(s, mode)}) + self.scores['relations']['fp'] = len({s.rid for s in sys}) - self.scores['relations']['tp'] + self.scores['relations']['fn'] = len({g.rid for g in gol}) - self.scores['relations']['tp'] + self.scores['relations']['tn'] = 0 + if verbose: + tps = {s for s in sys for g in gol if g.equals(s, mode)} + fps = set(sys) - tps + fns = set() + for g in gol: + if not len([s for s in sys if s.equals(g, mode)]): + fns.add(g) + for e in fps: + print('FP: ' + str(e)) + for e in fns: + print('FN:' + str(e)) + + +class MultipleEvaluator(object): + """Evaluate two sets of files.""" + + def __init__(self, corpora, tag_type=None, mode='strict', + verbose=False): + """Initialize.""" + assert isinstance(corpora, Corpora) + assert mode in ('strict', 'lenient') + self.scores = None + if corpora.track == 1: + self.track1(corpora) + else: + self.track2(corpora, tag_type, mode, verbose) + + def track1(self, corpora): + """Compute measures for Track 1.""" + self.tags = ('ABDOMINAL', 'ADVANCED-CAD', 'ALCOHOL-ABUSE', + 'ASP-FOR-MI', 'CREATININE', 'DIETSUPP-2MOS', + 'DRUG-ABUSE', 'ENGLISH', 'HBA1C', 'KETO-1YR', + 'MAJOR-DIABETES', 'MAKES-DECISIONS', 'MI-6MOS') + self.scores = defaultdict(dict) + metrics = ('p', 'r', 'f1', 'specificity', 'auc') + values = ('met', 'not met') + self.values = {'met': {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}, + 'not met': {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}} + + def evaluation(corpora, value, scores): + predictions = defaultdict(list) + for g, s in corpora.docs: + for tag in self.tags: + predictions[tag].append( + (g.tags[tag].value == value, s.tags[tag].value == value)) + for tag in self.tags: + # accumulate for micro overall measure + self.values[value]['tp'] += predictions[tag].count((True, True)) + self.values[value]['fp'] += predictions[tag].count((False, True)) + self.values[value]['tn'] += predictions[tag].count((False, False)) + self.values[value]['fn'] += predictions[tag].count((True, False)) + + # compute per-tag measures + measures = Measures(tp=predictions[tag].count((True, True)), + fp=predictions[tag].count((False, True)), + tn=predictions[tag].count((False, False)), + fn=predictions[tag].count((True, False))) + scores[(tag, value, 'p')] = measures.precision() + scores[(tag, value, 'r')] = measures.recall() + scores[(tag, value, 'f1')] = measures.f1() + scores[(tag, value, 'specificity')] = measures.specificity() + scores[(tag, 
value, 'auc')] = measures.auc() + return scores + + self.scores = evaluation(corpora, 'met', self.scores) + self.scores = evaluation(corpora, 'not met', self.scores) + + for measure in metrics: + for value in values: + self.scores[('macro', value, measure)] = sum( + [self.scores[(t, value, measure)] for t in self.tags]) / len(self.tags) + + def track2(self, corpora, tag_type=None, mode='strict', verbose=False): + """Compute measures for Track 2.""" + self.scores = {'tags': {'tp': 0, + 'fp': 0, + 'fn': 0, + 'tn': 0, + 'micro': {'precision': 0, + 'recall': 0, + 'f1': 0}, + 'macro': {'precision': 0, + 'recall': 0, + 'f1': 0}}, + 'relations': {'tp': 0, + 'fp': 0, + 'fn': 0, + 'tn': 0, + 'micro': {'precision': 0, + 'recall': 0, + 'f1': 0}, + 'macro': {'precision': 0, + 'recall': 0, + 'f1': 0}}} + + # -----------------------------------------CG-------------------------------------------- + if corpora.corpus_type == 'cg': + self.tags = ( + 'Gene_expression', 'Mutation', 'Regulation', 'Development', 'Negative_regulation', 'Cell_proliferation', + 'Transcription', 'Glycosylation', 'Positive_regulation', 'Binding', 'Localization', 'Planned_process', + 'Metastasis', 'Death', 'Blood_vessel_development', 'Breakdown', 'Growth', 'Cell_transformation', + 'Carcinogenesis', 'Cell_differentiation', 'Cell_death', 'Cell_division', 'Infection', 'Pathway', + 'Dephosphorylation', 'Synthesis', 'Catabolism', 'Protein_processing', 'Remodeling', 'Metabolism', + 'Dissociation', 'Phosphorylation', 'Glycolysis', 'Translation', 'DNA_methylation', 'Reproduction', + 'Acetylation', 'Ubiquitination', 'Amino_acid_catabolism', 'DNA_demethylation', 'Gene_or_gene_product', + 'Cancer', 'Cell', 'Organism', 'DNA_domain_or_region', 'Simple_chemical', 'Multi-tissue_structure', + 'Organ', 'Organism_subdivision', 'Tissue', 'Immaterial_anatomical_entity', 'Organism_substance', + 'Protein_domain_or_region', 'Cellular_component', 'Pathological_formation', 'Amino_acid', + 'Anatomical_system', 'Developing_anatomical_structure') + self.exclude_tags = () + self.relations = ( + 'CSite', 'FromLoc', 'Site', 'ToLoc', 'Participant', 'AtLoc', 'Instrument', 'Cause', 'Theme') + elif corpora.corpus_type == 'cg_tr': + self.tags = ( + 'Gene_expression', 'Mutation', 'Regulation', 'Development', 'Negative_regulation', 'Cell_proliferation', + 'Transcription', 'Glycosylation', 'Positive_regulation', 'Binding', 'Localization', 'Planned_process', + 'Metastasis', 'Death', 'Blood_vessel_development', 'Breakdown', 'Growth', 'Cell_transformation', + 'Carcinogenesis', 'Cell_differentiation', 'Cell_death', 'Cell_division', 'Infection', 'Pathway', + 'Dephosphorylation', 'Synthesis', 'Catabolism', 'Protein_processing', 'Remodeling', 'Metabolism', + 'Dissociation', 'Phosphorylation', 'Glycolysis', 'Translation', 'DNA_methylation', 'Reproduction', + 'Acetylation', 'Ubiquitination', 'Amino_acid_catabolism', 'DNA_demethylation', 'Gene_or_gene_product', + 'Cancer', 'Cell', 'Organism', 'DNA_domain_or_region', 'Simple_chemical', 'Multi-tissue_structure', + 'Organ', 'Organism_subdivision', 'Tissue', 'Immaterial_anatomical_entity', 'Organism_substance', + 'Protein_domain_or_region', 'Cellular_component', 'Pathological_formation', 'Amino_acid', + 'Anatomical_system', 'Developing_anatomical_structure') + self.exclude_tags = ( + 'Immaterial_anatomical_entity', 'Cancer', 'Multi-tissue_structure', 'Anatomical_system', + 'Pathological_formation', 'Tissue', 'Gene_or_gene_product', 'Cell', 'Protein_domain_or_region', + 'Developing_anatomical_structure', 'Organism', 'Organ', 
'Simple_chemical', 'Organism_subdivision', + 'Amino_acid', 'Organism_substance', 'DNA_domain_or_region', 'Cellular_component' + ) + self.relations = ( + 'CSite', 'FromLoc', 'Site', 'ToLoc', 'Participant', 'AtLoc', 'Instrument', 'Cause', 'Theme' + ) + elif corpora.corpus_type == 'cg_en': + self.tags = ( + 'Gene_expression', 'Mutation', 'Regulation', 'Development', 'Negative_regulation', 'Cell_proliferation', + 'Transcription', 'Glycosylation', 'Positive_regulation', 'Binding', 'Localization', 'Planned_process', + 'Metastasis', 'Death', 'Blood_vessel_development', 'Breakdown', 'Growth', 'Cell_transformation', + 'Carcinogenesis', 'Cell_differentiation', 'Cell_death', 'Cell_division', 'Infection', 'Pathway', + 'Dephosphorylation', 'Synthesis', 'Catabolism', 'Protein_processing', 'Remodeling', 'Metabolism', + 'Dissociation', 'Phosphorylation', 'Glycolysis', 'Translation', 'DNA_methylation', 'Reproduction', + 'Acetylation', 'Ubiquitination', 'Amino_acid_catabolism', 'DNA_demethylation', 'Gene_or_gene_product', + 'Cancer', 'Cell', 'Organism', 'DNA_domain_or_region', 'Simple_chemical', 'Multi-tissue_structure', + 'Organ', 'Organism_subdivision', 'Tissue', 'Immaterial_anatomical_entity', 'Organism_substance', + 'Protein_domain_or_region', 'Cellular_component', 'Pathological_formation', 'Amino_acid', + 'Anatomical_system', 'Developing_anatomical_structure') + self.exclude_tags = ( + 'Positive_regulation', 'Negative_regulation', 'Regulation', 'Planned_process', 'Gene_expression', + 'Localization', 'Blood_vessel_development', 'Metastasis', 'Development', 'Cell_proliferation', + 'Cell_death', 'Binding', 'Pathway', 'Mutation', 'Cell_transformation', 'Carcinogenesis', + 'Growth', 'Death', 'Transcription', 'Breakdown', 'Cell_differentiation', 'Phosphorylation', + 'Metabolism', 'Glycolysis', 'Synthesis', 'Remodeling', 'DNA_methylation', 'Catabolism', + 'Infection', 'Protein_processing', 'Translation', 'Glycosylation', 'Dephosphorylation', + 'Acetylation', 'Dissociation', 'Cell_division', 'Amino_acid_catabolism', 'Reproduction', + 'Ubiquitination', 'DNA_demethylation') + self.relations = ( + 'CSite', 'FromLoc', 'Site', 'ToLoc', 'Participant', 'AtLoc', 'Instrument', 'Cause', 'Theme') + + # -----------------------------------------ACE-------------------------------------------- + elif corpora.corpus_type == 'ace': + self.tags = ( + 'Die', 'Injure', 'Attack', 'Transport', 'Start-Position', 'Arrest-Jail', 'Meet', 'Transfer-Money', + 'Sue', 'Charge-Indict', 'Sentence', 'Convict', 'End-Position', 'Transfer-Ownership', 'Demonstrate', + 'Execute', 'Appeal', 'Phone-Write', 'Elect', 'Trial-Hearing', 'Release-Parole', 'Acquit', 'Fine', + 'Start-Org', 'End-Org', 'Marry', 'Declare-Bankruptcy', 'Be-Born', 'Divorce', 'Extradite', 'Pardon', + 'Nominate', 'Merge-Org', 'LOC', 'FAC', 'PER', 'ORG', 'GPE', 'Time', 'WEA', 'VEH', 'Money', 'Crime', + 'Percent', 'Job-Title') + self.exclude_tags = () + self.relations = ( + 'Price', 'Time-At-End', 'Time-At-Beginning', 'Time-Ending', 'Time-Before', 'Time-After', 'Prosecutor', + 'Beneficiary', 'Seller', 'Time-Starting', 'Time-Holds', 'Plaintiff', 'Sentence', 'Vehicle', 'Money', + 'Buyer', 'Adjudicator', 'Org', 'Giver', 'Position', 'Recipient', 'Origin', 'Crime', 'Instrument', + 'Defendant', 'Agent', 'Target', 'Destination', 'Attacker', 'Victim', 'Artifact', 'Person', + 'Time-Within', 'Entity', 'Place') + + elif corpora.corpus_type == 'ace_tr': + self.tags = ( + 'Die', 'Injure', 'Attack', 'Transport', 'Start-Position', 'Arrest-Jail', 'Meet', 'Transfer-Money', + 'Sue', 'Charge-Indict', 
'Sentence', 'Convict', 'End-Position', 'Transfer-Ownership', 'Demonstrate', + 'Execute', 'Appeal', 'Phone-Write', 'Elect', 'Trial-Hearing', 'Release-Parole', 'Acquit', 'Fine', + 'Start-Org', 'End-Org', 'Marry', 'Declare-Bankruptcy', 'Be-Born', 'Divorce', 'Extradite', 'Pardon', + 'Nominate', 'Merge-Org', 'LOC', 'FAC', 'PER', 'ORG', 'GPE', 'Time', 'WEA', 'VEH', 'Money', 'Crime', + 'Percent', 'Job-Title') + self.exclude_tags = ( + 'ORG', 'VEH', 'Time', 'GPE', 'FAC', 'Money', 'LOC', 'PER', 'WEA', 'Job-Title', 'Percent', 'Crime' + ) + self.relations = ( + 'Price', 'Time-At-End', 'Time-At-Beginning', 'Time-Ending', 'Time-Before', 'Time-After', 'Prosecutor', + 'Beneficiary', 'Seller', 'Time-Starting', 'Time-Holds', 'Plaintiff', 'Sentence', 'Vehicle', 'Money', + 'Buyer', 'Adjudicator', 'Org', 'Giver', 'Position', 'Recipient', 'Origin', 'Crime', 'Instrument', + 'Defendant', 'Agent', 'Target', 'Destination', 'Attacker', 'Victim', 'Artifact', 'Person', + 'Time-Within', 'Entity', 'Place') + + # -----------------------------------------GE13-------------------------------------------- + elif corpora.corpus_type == 'ge13': + self.tags = ( + "Anaphora", + "Entity", + "Protein", + "Acetylation", + "Binding", + "Deacetylation", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Protein_modification", + "Regulation", + "Transcription", + "Ubiquitination", + ) + self.exclude_tags = ( + ) + self.relations = ( + "CSite", + "Cause", + "Site", + "Theme", + "ToLoc" + ) + elif corpora.corpus_type == 'ge13_tr': + self.tags = ( + "Anaphora", + "Entity", + "Protein", + "Acetylation", + "Binding", + "Deacetylation", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Protein_modification", + "Regulation", + "Transcription", + "Ubiquitination", + ) + self.exclude_tags = ( + "Anaphora", + "Entity", + "Protein", + ) + self.relations = ( + "CSite", + "Cause", + "Site", + "Theme", + "ToLoc", + ) + elif corpora.corpus_type == 'ge13_en': + self.tags = ( + "Anaphora", + "Entity", + "Protein", + "Acetylation", + "Binding", + "Deacetylation", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Protein_modification", + "Regulation", + "Transcription", + "Ubiquitination", + ) + self.exclude_tags = ( + "Acetylation", + "Binding", + "Deacetylation", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Protein_modification", + "Regulation", + "Transcription", + "Ubiquitination", + ) + self.relations = ( + "CSite", + "Cause", + "Site", + "Theme", + "ToLoc", + ) + # -----------------------------------------GE11-------------------------------------------- + elif corpora.corpus_type == 'ge11': + self.tags = ( + "Entity", + "Protein", + "Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Regulation", + "Transcription", + ) + self.exclude_tags = ( + ) + self.relations = ( + "AtLoc", + "CSite", + "Cause", + "Site", + "Theme", + "ToLoc", + ) + elif corpora.corpus_type == 'ge11_tr': + self.tags = ( + "Entity", + "Protein", + "Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Regulation", + "Transcription", + ) + 
self.exclude_tags = ( + "Entity", + "Protein" + ) + self.relations = ( + "AtLoc", + "CSite", + "Cause", + "Site", + "Theme", + "ToLoc", + ) + + elif corpora.corpus_type == 'ge11_en': + self.tags = ( + "Entity", + "Protein", + "Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Regulation", + "Transcription", + ) + self.exclude_tags = ("Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Protein_catabolism", + "Regulation", + "Transcription",) + self.relations = ( + "AtLoc", + "CSite", + "Cause", + "Site", + "Theme", + "ToLoc", + ) + # -----------------------------------------ID-------------------------------------------- + elif corpora.corpus_type == 'id': + self.tags = ( + "Chemical", + "Entity", + "Organism", + "Protein", + "Regulon-operon", + "Two-component-system", + "Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Process", + "Protein_catabolism", + "Regulation", + "Transcription", + ) + self.exclude_tags = ( + ) + self.relations = ( + "AtLoc", + "CSite", + "Cause", + "Participant", + "Site", + "Theme", + "ToLoc" + ) + elif corpora.corpus_type == 'id_tr': + self.tags = ( + "Chemical", + "Entity", + "Organism", + "Protein", + "Regulon-operon", + "Two-component-system", + "Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Process", + "Protein_catabolism", + "Regulation", + "Transcription", + ) + self.exclude_tags = ( + "Chemical", + "Entity", + "Organism", + "Protein", + "Regulon-operon", + "Two-component-system", + ) + self.relations = ( + "AtLoc", + "CSite", + "Cause", + "Participant", + "Site", + "Theme", + "ToLoc", + ) + elif corpora.corpus_type == 'id_en': + self.tags = ( + "Binding", + "Gene_expression", + "Localization", + "Negative_regulation", + "Phosphorylation", + "Positive_regulation", + "Process", + "Protein_catabolism", + "Regulation", + "Transcription", + ) + self.exclude_tags = () + self.relations = ( + "AtLoc", + "CSite", + "Cause", + "Participant", + "Site", + "Theme", + "ToLoc", + ) + + # -----------------------------------------MLEE-------------------------------------------- + elif corpora.corpus_type == 'mlee': + self.tags = ( + 'Positive_regulation', 'Blood_vessel_development', 'Negative_regulation', 'Regulation', + 'Planned_process', + 'Localization', 'Development', 'Gene_expression', 'Growth', 'Binding', 'Cell_proliferation', 'Pathway', + 'Death', 'Breakdown', 'Remodeling', 'Catabolism', 'Phosphorylation', 'Transcription', 'Synthesis', + 'DNA_methylation', 'Metabolism', 'Protein_processing', 'Acetylation', 'Translation', + 'Dephosphorylation', + 'Ubiquitination', 'Gene_or_gene_product', 'Cell', 'Drug_or_compound', 'Pathological_formation', + 'Organism', 'Multi-tissue_structure', + 'Tissue', 'Organ', 'Cellular_component', 'Organism_substance', 'DNA_domain_or_region', + 'Organism_subdivision', + 'Protein_domain_or_region', 'Anatomical_system', 'Immaterial_anatomical_entity', + 'Developing_anatomical_structure' + ) + self.exclude_tags = () + self.relations = ( + 'Theme', 'Cause', 'Instrument', 'AtLoc', 'Participant', 'Site', 'ToLoc', 'CSite', 'FromLoc') + elif corpora.corpus_type == 'mlee_tr': + self.tags = ( + 'Positive_regulation', 'Blood_vessel_development', 'Negative_regulation', 'Regulation', + 'Planned_process', + 'Localization', 'Development', 
'Gene_expression', 'Growth', 'Binding', 'Cell_proliferation', 'Pathway', + 'Death', 'Breakdown', 'Remodeling', 'Catabolism', 'Phosphorylation', 'Transcription', 'Synthesis', + 'DNA_methylation', 'Metabolism', 'Protein_processing', 'Acetylation', 'Translation', + 'Dephosphorylation', + 'Ubiquitination', 'Gene_or_gene_product', 'Cell', 'Drug_or_compound', 'Pathological_formation', + 'Organism', 'Multi-tissue_structure', + 'Tissue', 'Organ', 'Cellular_component', 'Organism_substance', 'DNA_domain_or_region', + 'Organism_subdivision', + 'Protein_domain_or_region', 'Anatomical_system', 'Immaterial_anatomical_entity', + 'Developing_anatomical_structure' + ) + self.exclude_tags = ( + 'Gene_or_gene_product', 'Cell', 'Drug_or_compound', 'Pathological_formation', 'Organism', + 'Multi-tissue_structure', + 'Tissue', 'Organ', 'Cellular_component', 'Organism_substance', 'DNA_domain_or_region', + 'Organism_subdivision', + 'Protein_domain_or_region', 'Anatomical_system', 'Immaterial_anatomical_entity', + 'Developing_anatomical_structure' + ) + self.relations = ( + 'Theme', 'Cause', 'Instrument', 'AtLoc', 'Participant', 'Site', 'ToLoc', 'CSite', 'FromLoc') + elif corpora.corpus_type == 'mlee_en': + self.tags = ( + 'Positive_regulation', 'Blood_vessel_development', 'Negative_regulation', 'Regulation', + 'Planned_process', + 'Localization', 'Development', 'Gene_expression', 'Growth', 'Binding', 'Cell_proliferation', 'Pathway', + 'Death', 'Breakdown', 'Remodeling', 'Catabolism', 'Phosphorylation', 'Transcription', 'Synthesis', + 'DNA_methylation', 'Metabolism', 'Protein_processing', 'Acetylation', 'Translation', + 'Dephosphorylation', + 'Ubiquitination', 'Gene_or_gene_product', 'Cell', 'Drug_or_compound', 'Pathological_formation', + 'Organism', 'Multi-tissue_structure', + 'Tissue', 'Organ', 'Cellular_component', 'Organism_substance', 'DNA_domain_or_region', + 'Organism_subdivision', + 'Protein_domain_or_region', 'Anatomical_system', 'Immaterial_anatomical_entity', + 'Developing_anatomical_structure' + ) + self.exclude_tags = ( + 'Positive_regulation', 'Blood_vessel_development', 'Negative_regulation', 'Planned_process', + 'Regulation', + 'Localization', 'Gene_expression', 'Development', 'Growth', 'Binding', 'Cell_proliferation', 'Pathway', + 'Death', 'Breakdown', 'Remodeling', 'Phosphorylation', 'Catabolism', 'Transcription', 'Synthesis', + 'DNA_methylation', 'Metabolism', 'Protein_processing', 'Dephosphorylation', 'Reproduction', + 'Acetylation', + 'Translation', 'Cell_division', 'Dissociation', 'Ubiquitination') + self.relations = ( + 'Theme', 'Cause', 'Instrument', 'AtLoc', 'Participant', 'Site', 'ToLoc', 'CSite', 'FromLoc') + + # -----------------------------------------PC-------------------------------------------- + elif corpora.corpus_type == 'pc': + self.tags = ( + 'Positive_regulation', 'Negative_regulation', 'Regulation', 'Binding', 'Pathway', 'Phosphorylation', + 'Gene_expression', 'Activation', 'Transport', 'Conversion', 'Localization', 'Inactivation', + 'Transcription', + 'Dissociation', 'Degradation', 'Ubiquitination', 'Acetylation', 'Dephosphorylation', 'Translation', + 'Methylation', + 'Demethylation', 'Deubiquitination', 'Hydroxylation', 'Deacetylation', + 'Gene_or_gene_product', 'Simple_chemical', 'Complex', 'Cellular_component' + ) + self.exclude_tags = ( + ) + self.relations = ( + 'Theme', 'Cause', 'Participant', 'Site', 'Product', 'ToLoc', 'AtLoc', 'FromLoc' + ) + elif corpora.corpus_type == 'pc_tr': + self.tags = ( + 'Positive_regulation', 'Negative_regulation', 'Regulation', 'Binding', 
'Pathway', 'Phosphorylation', + 'Gene_expression', 'Activation', 'Transport', 'Conversion', 'Localization', 'Inactivation', + 'Transcription', + 'Dissociation', 'Degradation', 'Ubiquitination', 'Acetylation', 'Dephosphorylation', 'Translation', + 'Methylation', + 'Demethylation', 'Deubiquitination', 'Hydroxylation', 'Deacetylation' + ) + self.exclude_tags = ( + 'Gene_or_gene_product', 'Simple_chemical', 'Complex', 'Cellular_component' + ) + self.relations = ( + 'Theme', 'Cause', 'Participant', 'Site', 'Product', 'ToLoc', 'AtLoc', 'FromLoc' + ) + elif corpora.corpus_type == 'pc_en': + self.tags = ( + 'Positive_regulation', 'Negative_regulation', 'Regulation', 'Binding', 'Pathway', 'Phosphorylation', + 'Gene_expression', 'Activation', 'Transport', 'Conversion', 'Localization', 'Inactivation', + 'Transcription', + 'Dissociation', 'Degradation', 'Ubiquitination', 'Acetylation', 'Dephosphorylation', 'Translation', + 'Methylation', + 'Demethylation', 'Deubiquitination', 'Hydroxylation', 'Deacetylation', + 'Gene_or_gene_product', 'Simple_chemical', 'Complex', 'Cellular_component' + ) + self.exclude_tags = ( + 'Positive_regulation', 'Negative_regulation', 'Regulation', 'Binding', 'Pathway', 'Phosphorylation', + 'Gene_expression', 'Activation', 'Transport', 'Conversion', 'Localization', 'Inactivation', + 'Transcription', + 'Dissociation', 'Degradation', 'Ubiquitination', 'Acetylation', 'Dephosphorylation', 'Translation', + 'Methylation', + 'Demethylation', 'Deubiquitination', 'Hydroxylation', 'Deacetylation' + ) + self.relations = ( + 'Theme', 'Cause', 'Participant', 'Site', 'Product', 'ToLoc', 'AtLoc', 'FromLoc' + ) + # -----------------------------------------EPI-------------------------------------------- + + elif corpora.corpus_type == 'epi': + self.tags = ( + 'Methylation', 'Glycosylation', 'Acetylation', 'Ubiquitination', 'DNA_methylation', 'Catalysis', + 'Hydroxylation', 'Phosphorylation', 'Deacetylation', 'Deglycosylation', 'DNA_demethylation', + 'Deubiquitination', 'Demethylation', 'Dephosphorylation', 'Dehydroxylation', + 'Protein', 'Entity' + ) + self.exclude_tags = ( + ) + self.relations = ( + 'Theme', 'Site', 'Cause', 'Contextgene', 'Sidechain' + ) + elif corpora.corpus_type == 'epi_tr': + self.tags = ( + 'Methylation', 'Glycosylation', 'Acetylation', 'Ubiquitination', 'DNA_methylation', 'Catalysis', + 'Hydroxylation', 'Phosphorylation', 'Deacetylation', 'Deglycosylation', 'DNA_demethylation', + 'Deubiquitination', 'Demethylation', 'Dephosphorylation', 'Dehydroxylation', + 'Protein', 'Entity' + ) + self.exclude_tags = ( + 'Protein', 'Entity' + ) + self.relations = ( + 'Theme', 'Site', 'Cause', 'Contextgene', 'Sidechain' + ) + elif corpora.corpus_type == 'epi_en': + self.tags = ( + 'Methylation', 'Glycosylation', 'Acetylation', 'Ubiquitination', 'DNA_methylation', 'Catalysis', + 'Hydroxylation', 'Phosphorylation', 'Deacetylation', 'Deglycosylation', 'DNA_demethylation', + 'Deubiquitination', 'Demethylation', 'Dephosphorylation', 'Dehydroxylation', + 'Protein', 'Entity' + ) + self.exclude_tags = ( + 'Methylation', 'Glycosylation', 'Acetylation', 'Ubiquitination', 'DNA_methylation', 'Catalysis', + 'Hydroxylation', 'Phosphorylation', 'Deacetylation', 'Deglycosylation', 'DNA_demethylation', + 'Deubiquitination', 'Demethylation', 'Dephosphorylation', 'Dehydroxylation' + ) + self.relations = ( + 'Theme', 'Site', 'Cause', 'Contextgene', 'Sidechain' + ) + + # -----------------------------------------EZCAT-------------------------------------------- + elif corpora.corpus_type == "ezcat": + 
self.tags = ( + "Activation", + "BondFormation", + "Cleavage", + "ConformationalChange", + "CouplingReaction", + "Deprotonation", + "Destabilisation", + "ElectrophilicAttack", + "HybridisationChange", + "Inactivation", + "Interaction", + "Modulation", + "NucleophilicAttack", + "Others", + "Protonation", + "Release", + "Stabilisation", + "UncouplingReaction", + "WholeReaction", + "AminoAcid", + "Cofactor", + "EntityProperty", + "Enzyme", + "FunctionalGroup", + "MethodCue", + "NegationCue", + "OtherCompound", + "SpeculationCue", + ) + self.exclude_tags = () + self.relations = ( + "Agent", + "Cue", + "EndPoint", + "InitialPoint", + "Means", + "Theme", + ) + elif corpora.corpus_type == "ezcat_tr": + self.tags = ( + "Activation", + "BondFormation", + "Cleavage", + "ConformationalChange", + "CouplingReaction", + "Deprotonation", + "Destabilisation", + "ElectrophilicAttack", + "HybridisationChange", + "Inactivation", + "Interaction", + "Modulation", + "NucleophilicAttack", + "Others", + "Protonation", + "Release", + "Stabilisation", + "UncouplingReaction", + "WholeReaction", + "AminoAcid", + "Cofactor", + "EntityProperty", + "Enzyme", + "FunctionalGroup", + "MethodCue", + "NegationCue", + "OtherCompound", + "SpeculationCue", + ) + self.exclude_tags = ( + "AminoAcid", + "Cofactor", + "EntityProperty", + "Enzyme", + "FunctionalGroup", + "MethodCue", + "NegationCue", + "OtherCompound", + "SpeculationCue", + ) + self.relations = ( + "Agent", + "Cue", + "EndPoint", + "InitialPoint", + "Means", + "Theme", + ) + elif corpora.corpus_type == "ezcat_en": + self.tags = ( + "Activation", + "BondFormation", + "Cleavage", + "ConformationalChange", + "CouplingReaction", + "Deprotonation", + "Destabilisation", + "ElectrophilicAttack", + "HybridisationChange", + "Inactivation", + "Interaction", + "Modulation", + "NucleophilicAttack", + "Others", + "Protonation", + "Release", + "Stabilisation", + "UncouplingReaction", + "WholeReaction", + "AminoAcid", + "Cofactor", + "EntityProperty", + "Enzyme", + "FunctionalGroup", + "MethodCue", + "NegationCue", + "OtherCompound", + "SpeculationCue", + ) + self.exclude_tags = ( + "Activation", + "BondFormation", + "Cleavage", + "ConformationalChange", + "CouplingReaction", + "Deprotonation", + "Destabilisation", + "ElectrophilicAttack", + "HybridisationChange", + "Inactivation", + "Interaction", + "Modulation", + "NucleophilicAttack", + "Others", + "Protonation", + "Release", + "Stabilisation", + "UncouplingReaction", + "WholeReaction", + ) + self.relations = ( + "Agent", + "Cue", + "EndPoint", + "InitialPoint", + "Means", + "Theme", + ) + + # -----------------------------------------LCGENES-------------------------------------------- + + elif corpora.corpus_type == 'lcgenes': + self.tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Pharmacological_substance', + 'Cell', 'Method_cue', 'Anatomical_entity', 'Cell_component', 'Organic_compound_other', + 'Inorganic_compound', + 'Artificial_process', 'Molecular_function', 'Biological_process', 'Cellular_process', 'Regulation', + ) + self.exclude_tags = ( + ) + self.relations = ( + ) + elif corpora.corpus_type == 'lcgenes_tr': + self.tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Pharmacological_substance', + 'Cell', 'Method_cue', 'Anatomical_entity', 'Cell_component', 'Organic_compound_other', + 'Inorganic_compound', + 'Artificial_process', 'Molecular_function', 'Biological_process', 'Cellular_process', 'Regulation', + ) + self.exclude_tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Pharmacological_substance', + 'Cell', 
'Method_cue', 'Anatomical_entity', 'Cell_component', 'Organic_compound_other', + 'Inorganic_compound', + ) + self.relations = ( + ) + elif corpora.corpus_type == 'lcgenes_en': + self.tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Pharmacological_substance', + 'Cell', 'Method_cue', 'Anatomical_entity', 'Cell_component', 'Organic_compound_other', + 'Inorganic_compound', + 'Artificial_process', 'Molecular_function', 'Biological_process', 'Cellular_process', 'Regulation', + ) + self.exclude_tags = ( + 'Artificial_process', 'Molecular_function', 'Biological_process', 'Cellular_process', 'Regulation', + ) + self.relations = ( + ) + # -----------------------------------------IPF-------------------------------------------- + elif corpora.corpus_type == 'ipf': + self.tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Anatomical_entity', 'Cell', 'Method_cue', + 'Pharmacological_substance', 'Entity_Property', 'Organic_compound_other', 'Cell_component', + 'Inorganic_compound', + 'Artificial_process', 'Positive_regulation', 'Gene_expression', 'Negative_regulation', + 'Cellular_process', 'Biological_process', 'Pathway', 'Molecular_function', 'Regulation', 'Migration', + 'Localization', + ) + self.exclude_tags = ( + ) + self.relations = ( + 'Theme', 'Participant', 'Cause', 'disorder', 'atLoc', + ) + elif corpora.corpus_type == 'ipf_tr': + self.tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Anatomical_entity', 'Cell', 'Method_cue', + 'Pharmacological_substance', 'Entity_Property', 'Organic_compound_other', 'Cell_component', + 'Inorganic_compound', + 'Artificial_process', 'Positive_regulation', 'Gene_expression', 'Negative_regulation', + 'Cellular_process', 'Biological_process', 'Pathway', 'Molecular_function', 'Regulation', 'Migration', + 'Localization', + ) + self.exclude_tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Anatomical_entity', 'Cell', 'Method_cue', + 'Pharmacological_substance', 'Entity_Property', 'Organic_compound_other', 'Cell_component', + 'Inorganic_compound', + ) + self.relations = ( + 'Theme', 'Participant', 'Cause', 'disorder', 'atLoc', + ) + elif corpora.corpus_type == 'ipf_en': + self.tags = ( + 'MMLite', 'GGPs', 'Disorder', 'Subject', 'Anatomical_entity', 'Cell', 'Method_cue', + 'Pharmacological_substance', 'Entity_Property', 'Organic_compound_other', 'Cell_component', + 'Inorganic_compound', + 'Artificial_process', 'Positive_regulation', 'Gene_expression', 'Negative_regulation', + 'Cellular_process', 'Biological_process', 'Pathway', 'Molecular_function', 'Regulation', 'Migration', + 'Localization', + ) + self.exclude_tags = ( + 'Artificial_process', 'Positive_regulation', 'Gene_expression', 'Negative_regulation', + 'Cellular_process', 'Biological_process', 'Pathway', 'Molecular_function', 'Regulation', 'Migration', + 'Localization', + ) + self.relations = ( + 'Theme', 'Participant', 'Cause', 'disorder', 'atLoc', + ) + + # -----------------------------------------GPCR-------------------------------------------- + elif corpora.corpus_type == 'gpcr': + self.tags = ( + 'Protein', 'GPCR', 'GPCR-ligand', 'Chemical', 'Cell', 'G-protein', 'Disease', + 'Cell-component', 'Organism', 'Anatomy', 'Entity', + 'Regulation', 'Positive_regulation', 'Biological_process', 'Negative_regulation', 'Pathway', 'Binding', + 'Gene_expression', 'Artificial_process', 'Localization', 'Phosphorylation', 'Internalization', + 'Biosynthesis', 'Conformational-change', 'Degradation', 'Conversion', 'Transportation', 'Dissociation', + 'Transcription', 'Dephosphorylation', 'Translation', + ) + 
self.exclude_tags = ( + ) + self.relations = ( + 'Theme', 'Cause', 'Participant', 'AtLoc', 'Site', 'Product', 'ToLoc', 'FromLoc', + ) + elif corpora.corpus_type == 'gpcr_tr': + self.tags = ( + 'Protein', 'GPCR', 'GPCR-ligand', 'Chemical', 'Cell', 'G-protein', 'Disease', + 'Cell-component', 'Organism', 'Anatomy', 'Entity', + 'Regulation', 'Positive_regulation', 'Biological_process', 'Negative_regulation', 'Pathway', 'Binding', + 'Gene_expression', 'Artificial_process', 'Localization', 'Phosphorylation', 'Internalization', + 'Biosynthesis', 'Conformational-change', 'Degradation', 'Conversion', 'Transportation', 'Dissociation', + 'Transcription', 'Dephosphorylation', 'Translation', + ) + self.exclude_tags = ( + 'Protein', 'GPCR', 'GPCR-ligand', 'Chemical', 'Cell', 'G-protein', 'Disease', + 'Cell-component', 'Organism', 'Anatomy', 'Entity', + ) + self.relations = ( + 'Theme', 'Cause', 'Participant', 'AtLoc', 'Site', 'Product', 'ToLoc', 'FromLoc', + ) + elif corpora.corpus_type == 'gpcr_en': + self.tags = ( + 'Protein', 'GPCR', 'GPCR-ligand', 'Chemical', 'Cell', 'G-protein', 'Disease', + 'Cell-component', 'Organism', 'Anatomy', 'Entity', + 'Regulation', 'Positive_regulation', 'Biological_process', 'Negative_regulation', 'Pathway', 'Binding', + 'Gene_expression', 'Artificial_process', 'Localization', 'Phosphorylation', 'Internalization', + 'Biosynthesis', 'Conformational-change', 'Degradation', 'Conversion', 'Transportation', 'Dissociation', + 'Transcription', 'Dephosphorylation', 'Translation', + ) + self.exclude_tags = ( + 'Regulation', 'Positive_regulation', 'Biological_process', 'Negative_regulation', 'Pathway', 'Binding', + 'Gene_expression', 'Artificial_process', 'Localization', 'Phosphorylation', 'Internalization', + 'Biosynthesis', 'Conformational-change', 'Degradation', 'Conversion', 'Transportation', 'Dissociation', + 'Transcription', 'Dephosphorylation', 'Translation', + ) + self.relations = ( + 'Theme', 'Cause', 'Participant', 'AtLoc', 'Site', 'Product', 'ToLoc', 'FromLoc', + ) + + # -----------------------------------------GE04-------------------------------------------- + elif corpora.corpus_type == 'genia04': + self.tags = ( + "protein", + "DNA", + "cell_type", + "cell_line", + "RNA", + ) + self.exclude_tags = ( + ) + self.relations = ( + ) + + self.actual_tags = (tag for tag in self.tags if tag not in self.exclude_tags) # Not use set to keep order + for g, s in corpora.docs: + evaluator = SingleEvaluator(g, s, 2, mode, tag_type, verbose=verbose, exclude_tags=self.exclude_tags) + for target in ('tags', 'relations'): + for score in ('tp', 'fp', 'fn'): + self.scores[target][score] += evaluator.scores[target][score] + measures = Measures(tp=evaluator.scores[target]['tp'], + fp=evaluator.scores[target]['fp'], + fn=evaluator.scores[target]['fn'], + tn=evaluator.scores[target]['tn']) + for score in ('precision', 'recall', 'f1'): + fn = getattr(measures, score) + self.scores[target]['macro'][score] += fn() + + for target in ('tags', 'relations'): + # Normalization + for key in self.scores[target]['macro'].keys(): + self.scores[target]['macro'][key] = \ + self.scores[target]['macro'][key] / len(corpora.docs) + + measures = Measures(tp=self.scores[target]['tp'], + fp=self.scores[target]['fp'], + fn=self.scores[target]['fn'], + tn=self.scores[target]['tn']) + for key in self.scores[target]['micro'].keys(): + fn = getattr(measures, key) + self.scores[target]['micro'][key] = fn() + + +def evaluate(corpora, mode='strict', verbose=False): + """Run the evaluation by considering only files in the 
two folders.""" + assert mode in ('strict', 'lenient') + evaluator_s = MultipleEvaluator(corpora, verbose) + if corpora.track == 1: + macro_f1, macro_auc = 0, 0 + print('{:*^96}'.format(' TRACK 1 ')) + print('{:20} {:-^30} {:-^22} {:-^14}'.format('', ' met ', + ' not met ', + ' overall ')) + print('{:20} {:6} {:6} {:6} {:6} {:6} {:6} {:6} {:6} {:6}'.format( + '', 'Prec.', 'Rec.', 'Speci.', 'F(b=1)', 'Prec.', 'Rec.', 'F(b=1)', 'F(b=1)', 'AUC')) + for tag in evaluator_s.tags: + print( + '{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + tag.capitalize(), + evaluator_s.scores[(tag, 'met', 'p')], + evaluator_s.scores[(tag, 'met', 'r')], + evaluator_s.scores[(tag, 'met', 'specificity')], + evaluator_s.scores[(tag, 'met', 'f1')], + evaluator_s.scores[(tag, 'not met', 'p')], + evaluator_s.scores[(tag, 'not met', 'r')], + evaluator_s.scores[(tag, 'not met', 'f1')], + (evaluator_s.scores[(tag, 'met', 'f1')] + evaluator_s.scores[(tag, 'not met', 'f1')]) / 2, + evaluator_s.scores[(tag, 'met', 'auc')])) + macro_f1 += (evaluator_s.scores[(tag, 'met', 'f1')] + evaluator_s.scores[(tag, 'not met', 'f1')]) / 2 + macro_auc += evaluator_s.scores[(tag, 'met', 'auc')] + print('{:20} {:-^30} {:-^22} {:-^14}'.format('', '', '', '')) + m = Measures(tp=evaluator_s.values['met']['tp'], + fp=evaluator_s.values['met']['fp'], + fn=evaluator_s.values['met']['fn'], + tn=evaluator_s.values['met']['tn']) + nm = Measures(tp=evaluator_s.values['not met']['tp'], + fp=evaluator_s.values['not met']['fp'], + fn=evaluator_s.values['not met']['fn'], + tn=evaluator_s.values['not met']['tn']) + print( + '{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + 'Overall (micro)', m.precision(), m.recall(), m.specificity(), + m.f1(), nm.precision(), nm.recall(), nm.f1(), + (m.f1() + nm.f1()) / 2, m.auc())) + print( + '{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + 'Overall (macro)', + evaluator_s.scores[('macro', 'met', 'p')], + evaluator_s.scores[('macro', 'met', 'r')], + evaluator_s.scores[('macro', 'met', 'specificity')], + evaluator_s.scores[('macro', 'met', 'f1')], + evaluator_s.scores[('macro', 'not met', 'p')], + evaluator_s.scores[('macro', 'not met', 'r')], + evaluator_s.scores[('macro', 'not met', 'f1')], + macro_f1 / len(evaluator_s.tags), + evaluator_s.scores[('macro', 'met', 'auc')])) + print() + print('{:>20} {:^74}'.format('', ' {} files found '.format(len(corpora.docs)))) + else: + evaluator_l = MultipleEvaluator(corpora, mode='lenient', verbose=verbose) + print('{:*^70}'.format(' TRACK 2 ')) + print('{:20} {:-^22} {:-^22}'.format('', ' strict ', ' lenient ')) + print('{:20} {:6} {:6} {:6} {:6} {:6} {:6}'.format('', 'Prec.', + 'Rec.', + 'F(b=1)', + 'Prec.', + 'Rec.', + 'F(b=1)')) + for tag in evaluator_s.actual_tags: + evaluator_tag_s = MultipleEvaluator(corpora, tag, verbose=verbose) + evaluator_tag_l = MultipleEvaluator(corpora, tag, mode='lenient', verbose=verbose) + print( + '{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:>5} {:>5} {:>5} {:>5} {:>5} {:>5}'.format( + tag.capitalize(), + evaluator_tag_s.scores['tags']['micro']['precision'], + evaluator_tag_s.scores['tags']['micro']['recall'], + evaluator_tag_s.scores['tags']['micro']['f1'], + evaluator_tag_l.scores['tags']['micro']['precision'], + evaluator_tag_l.scores['tags']['micro']['recall'], + evaluator_tag_l.scores['tags']['micro']['f1'], + evaluator_tag_s.scores['tags']['tp'] + + 
evaluator_tag_s.scores['tags']['fp'], + evaluator_tag_s.scores['tags']['tp'] + + evaluator_tag_s.scores['tags']['fn'], + evaluator_tag_s.scores['tags']['tp'], + evaluator_tag_l.scores['tags']['tp'] + + evaluator_tag_l.scores['tags']['fp'], + evaluator_tag_l.scores['tags']['tp'] + + evaluator_tag_l.scores['tags']['fn'], + evaluator_tag_l.scores['tags']['tp'])) + print('{:>20} {:-^48}'.format('', '')) + print('{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + 'Overall (micro)', + evaluator_s.scores['tags']['micro']['precision'], + evaluator_s.scores['tags']['micro']['recall'], + evaluator_s.scores['tags']['micro']['f1'], + evaluator_l.scores['tags']['micro']['precision'], + evaluator_l.scores['tags']['micro']['recall'], + evaluator_l.scores['tags']['micro']['f1'])) + print('{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + 'Overall (macro)', + evaluator_s.scores['tags']['macro']['precision'], + evaluator_s.scores['tags']['macro']['recall'], + evaluator_s.scores['tags']['macro']['f1'], + evaluator_l.scores['tags']['macro']['precision'], + evaluator_l.scores['tags']['macro']['recall'], + evaluator_l.scores['tags']['macro']['f1'])) + print() + + print('{:*^70}'.format(' RELATIONS ')) + for rel in evaluator_s.relations: + evaluator_tag_s = MultipleEvaluator(corpora, rel, mode='strict', verbose=verbose) + evaluator_tag_l = MultipleEvaluator(corpora, rel, mode='lenient', verbose=verbose) + print( + '{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:>5} {:>5} {:>5} {:>5} {:>5} {:>5}'.format( + '{}'.format(rel), + evaluator_tag_s.scores['relations']['micro']['precision'], + evaluator_tag_s.scores['relations']['micro']['recall'], + evaluator_tag_s.scores['relations']['micro']['f1'], + evaluator_tag_l.scores['relations']['micro']['precision'], + evaluator_tag_l.scores['relations']['micro']['recall'], + evaluator_tag_l.scores['relations']['micro']['f1'], + evaluator_tag_s.scores['relations']['tp'] + + evaluator_tag_s.scores['relations']['fp'], + evaluator_tag_s.scores['relations']['tp'] + + evaluator_tag_s.scores['relations']['fn'], + evaluator_tag_s.scores['relations']['tp'], + evaluator_tag_l.scores['relations']['tp'] + + evaluator_tag_l.scores['relations']['fp'], + evaluator_tag_l.scores['relations']['tp'] + + evaluator_tag_l.scores['relations']['fn'], + evaluator_tag_l.scores['relations']['tp'])) + print('{:>20} {:-^48}'.format('', '')) + print('{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + 'Overall (micro)', + evaluator_s.scores['relations']['micro']['precision'], + evaluator_s.scores['relations']['micro']['recall'], + evaluator_s.scores['relations']['micro']['f1'], + evaluator_l.scores['relations']['micro']['precision'], + evaluator_l.scores['relations']['micro']['recall'], + evaluator_l.scores['relations']['micro']['f1'])) + print('{:>20} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f} {:<5.4f}'.format( + 'Overall (macro)', + evaluator_s.scores['relations']['macro']['precision'], + evaluator_s.scores['relations']['macro']['recall'], + evaluator_s.scores['relations']['macro']['f1'], + evaluator_l.scores['relations']['macro']['precision'], + evaluator_l.scores['relations']['macro']['recall'], + evaluator_l.scores['relations']['macro']['f1'])) + print() + print('{:20}{:^48}'.format('', ' {} files found '.format(len(corpora.docs)))) + + +class Corpora(object): + + def __init__(self, corpus_type, folder1, folder2, track_num): + extensions = {1: '*.xml', 2: '*.ann'} + file_ext = extensions[track_num] + self.track = 
track_num + self.folder1 = folder1 + self.folder2 = folder2 + self.corpus_type = corpus_type + files1 = set([os.path.basename(f) for f in glob.glob( + os.path.join(folder1, file_ext))]) + # print(files1) + files2 = set([os.path.basename(f) for f in glob.glob( + os.path.join(folder2, file_ext))]) + # print(files2) + common_files = files1 & files2 # intersection + if not common_files: + print('ERROR: None of the files match.') + else: + if files1 - common_files: + print('Files skipped in {}:'.format(self.folder1)) + print(', '.join(sorted(list(files1 - common_files)))) + if files2 - common_files: + print('Files skipped in {}:'.format(self.folder2)) + print(', '.join(sorted(list(files2 - common_files)))) + self.docs = [] + for file in common_files: + if track_num == 1: + g = RecordTrack1(os.path.join(self.folder1, file)) + s = RecordTrack1(os.path.join(self.folder2, file)) + else: + g = RecordTrack2(os.path.join(self.folder1, file)) + s = RecordTrack2(os.path.join(self.folder2, file)) + self.docs.append((g, s)) + + +def main(corpus_type, f1, f2, track, verbose): + """Where the magic begins.""" + corpora = Corpora(corpus_type, f1, f2, track) + if corpora.docs: + evaluate(corpora, verbose=verbose) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='n2c2: Evaluation script for Track 2') + parser.add_argument('folder1', help='First data folder path (gold)') + parser.add_argument('folder2', help='Second data folder path (system)') + parser.add_argument('--ner-eval-corpus', dest='corpus_type', type=str, required=True, + help='ace / cg / cg_tr / ace_tr') + args = parser.parse_args() + main(args.corpus_type, os.path.abspath(args.folder1), os.path.abspath(args.folder2), 2, False) diff --git a/loader/prepData/brat.py b/loader/prepData/brat.py index 42f8e3f..b2ba019 100644 --- a/loader/prepData/brat.py +++ b/loader/prepData/brat.py @@ -3,33 +3,80 @@ import glob import collections from collections import OrderedDict +import os def brat_loader(files_fold, params): file_list = glob.glob(files_fold + '*' + '.txt') + triggers = OrderedDict() entities = OrderedDict() - + relations = OrderedDict() + events = OrderedDict() sentences = OrderedDict() for filef in sorted(file_list): if filef.split("/")[-1].startswith("."): continue - filename = filef.split('/')[-1].split('.txt')[0] ffolder = '/'.join(filef.split('/')[:-1]) + '/' + + + # store data for each document + ftriggers = OrderedDict() fentities = OrderedDict() + frelations = OrderedDict() + fevents = OrderedDict() + + idsTR = [] + typesTR = [] + infoTR = OrderedDict() + termsTR = [] idsT = [] typesT = [] infoT = OrderedDict() termsT = [] + idsR = [] + typesR = [] + infoR = OrderedDict() + + idsE = [] + infoE = OrderedDict() + infoM = OrderedDict() + + # # check empty file, otherwise, create an empty file to fix bug pipeline (temporarily) + # filepath = ffolder + filename + '.ann' + # if not os.path.isfile(filepath): + # with open(filepath, 'w') as f: + # print('EMPTY FILE: ', filepath) + with open(ffolder + filename + '.ann', encoding="UTF-8") as infile: for line in infile: - if line.startswith('T'): + if line.startswith('TR'): + line = line.rstrip().split('\t') + trId = line[0] + tr1 = line[1].split() + trType = tr1[0] + pos1 = tr1[1] + pos2 = tr1[2] + text = line[2] + + idsTR.append(trId) + typesTR.append(trType) + trigger_info = OrderedDict() + trigger_info['id'] = trId + trigger_info['type'] = trType + trigger_info['pos1'] = pos1 + trigger_info['pos2'] = pos2 + trigger_info['text'] = text + infoTR[trId] = trigger_info + 
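# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. The new 'TR'
# branch in brat_loader above splits a brat trigger annotation of this shape;
# the example line is an assumption.
_line = "TR2\tPositive_regulation 161 171\tactivation"
_tr_id, _span, _text = _line.rstrip().split('\t')
_tr_type, _pos1, _pos2 = _span.split()
assert [_tr_id, _tr_type, _pos1, _pos2, _text] == \
    ['TR2', 'Positive_regulation', '161', '171', 'activation']
# ---------------------------------------------------------------------------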
termsTR.append([trId, trType, pos1, pos2, text]) + + elif line.startswith('T'): line = line.rstrip().split('\t') eid = line[0] e1 = line[1].split() @@ -49,7 +96,98 @@ def brat_loader(files_fold, params): infoT[eid] = ent_info termsT.append([eid, etype, pos1, pos2, text]) + elif line.startswith('R'): + line = line.rstrip().split('\t') + idR = line[0] + typeR = line[1].split()[0] + typeR = ''.join([i for i in typeR if not i.isdigit()]) + args = line[1].split()[1:] + arg1id = args[0].split(':')[1] + arg2id = args[1].split(':')[1] + + trig2 = False + trig1 = False + if arg1id.startswith('TR') and arg2id.startswith('TR'): + trig2 = True + trig1 = True + elif arg1id.startswith('TR'): + trig1 = True + + r_info = OrderedDict() + r_info['id'] = idR + r_info['type'] = typeR + r_info['arg1id'] = arg1id + r_info['arg2id'] = arg2id + r_info['2trigger'] = trig2 + r_info['1trigger'] = trig1 + + idsR.append(idR) + typesR.append(typeR) + infoR[idR] = r_info + + elif line.startswith('E'): + line = line.rstrip().split('\t') + idE = line[0] + args = line[1].split() + tr1 = args[0].split(':') + trType = tr1[0] + trId = tr1[1] + args_num = len(args) - 1 + + nestedEv_ = [] + args2 = [] + args_ids = [] + for xx, arg in enumerate(args[1:]): + role, eid = arg.split(':') + role = ''.join([i for i in role if not i.isdigit()]) + args2.append((role, eid)) + args_ids.append(eid) + if eid.startswith('E'): + nestedEv_.append(eid) + + zeroArg = False + if len(args2) == 0: + args2 = [()] + zeroArg = True + + if len(nestedEv_) > 0: + evArg = True + else: + evArg = False + + idsE.append(idE) + e_info = OrderedDict() + e_info['id'] = idE + e_info['trid'] = trId + e_info['trtype'] = trType + e_info['args_num'] = args_num + e_info['args_data'] = args2 + e_info['is_zeroArg'] = zeroArg + e_info['is_nested_ev'] = evArg + e_info['nested_events'] = nestedEv_ + e_info['is_flat_ev'] = len(nestedEv_) == 0 + e_info['args_ids'] = args_ids + + e_info['modality'] = 'non-modality' + + infoE[idE] = e_info + + elif line.startswith('M'): + line = line.rstrip().split('\t') + modals = line[1].split(' ') + idev = modals[1] + modal_type = modals[0] + infoM[idev] = modal_type + + typesTR2 = dict(collections.Counter(typesTR)) typesT2 = dict(collections.Counter(typesT)) + typesR2 = dict(collections.Counter(typesR)) + + ftriggers['data'] = infoTR + ftriggers['types'] = typesTR + ftriggers['counted_types'] = typesTR2 + ftriggers['ids'] = idsTR + ftriggers['terms'] = termsTR fentities['data'] = infoT fentities['types'] = typesT @@ -57,12 +195,26 @@ def brat_loader(files_fold, params): fentities['ids'] = idsT fentities['terms'] = termsT - # check empty entities - if len(idsT) == 0 and not params['raw_text']: + frelations['data'] = infoR + frelations['types'] = typesR + frelations['ids'] = idsR + frelations['counted_types'] = typesR2 + + for evid, modal_type in infoM.items(): + infoE[evid]['modality'] = modal_type + + fevents['data'] = infoE + fevents['ids'] = idsE + + # check empty + if len(idsT) == len(idsTR) == 0: continue else: entities[filename] = fentities + triggers[filename] = ftriggers + relations[filename] = frelations + events[filename] = fevents lowerc = params['lowercase'] with open(ffolder + filename + '.txt', encoding="UTF-8") as infile: @@ -75,4 +227,4 @@ def brat_loader(files_fold, params): lines.append(line) sentences[filename] = lines - return entities, sentences + return triggers, entities, relations, events, sentences diff --git a/loader/prepData/entity.py b/loader/prepData/entity.py index 8f99437..0fdb3e9 100644 --- 
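# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. An event ('E')
# line as parsed above: the first field is Type:TriggerId, the remaining
# fields are Role:ArgId pairs whose trailing digits (Theme2, Cause3, ...) are
# stripped, and any argument id starting with 'E' marks a nested event. The
# example line is an assumption.
_line = "E3\tRegulation:TR7 Theme:E1 Cause2:T4"
_ev_id, _fields = _line.rstrip().split('\t')
_args = _fields.split()
_tr_type, _tr_id = _args[0].split(':')
_pairs = []
for _arg in _args[1:]:
    _role, _target = _arg.split(':')
    _role = ''.join(c for c in _role if not c.isdigit())   # Cause2 -> Cause
    _pairs.append((_role, _target))
_is_nested = any(t.startswith('E') for _, t in _pairs)     # E1 => nested
assert _pairs == [('Theme', 'E1'), ('Cause', 'T4')] and _is_nested
# ---------------------------------------------------------------------------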
a/loader/prepData/entity.py +++ b/loader/prepData/entity.py @@ -28,14 +28,17 @@ def process_etypes(entities0): return entities1 -def process_tags(entities1): +def process_tags(entities1, triggers1): typesT = entities1['types'] + typesTR = triggers1['types'] + types2 = typesT + typesTR tags = [] + tagsTR = [] tags2types = OrderedDict() tags2types['O'] = 'O' - for type in typesT: + for type in types2: btag = 'B-' + type itag = 'I-' + type tags.append(btag) @@ -43,11 +46,17 @@ def process_tags(entities1): tags2types[btag] = type tags2types[itag] = type + if type in typesTR: + tagsTR.append(btag) + tagsTR.append(itag) + tags0 = OrderedDict() - tags0['types'] = typesT + tags0['types'] = types2 + tags0['typesTR'] = typesTR tags0['typesT'] = typesT tags0['tags'] = tags tags0['tags2types'] = tags2types + tags0['tagsTR'] = tagsTR return tags0 @@ -147,8 +156,9 @@ def spliter(line, _len=len): return offsets -def process_entities(entities1, sentences1, params, dirpath): +def process_entities(entities1, triggers1, sentences1, params, dirpath): entities0 = entities1['pmids'] + triggers0 = triggers1['pmids'] input0 = OrderedDict() @@ -157,11 +167,15 @@ def process_entities(entities1, sentences1, params, dirpath): for pmid in entities0: entities = entities0[pmid] + triggers = triggers0[pmid] sentences = sentences0[pmid] terms = entities['terms'] + terms.extend(triggers['terms']) nest_level, terms = count_nest_level(terms, params) + # nest_level, terms = utils.count_nest_level(terms) + # terms, file_discard_count = utils.dicard_invalid_nes(terms, sentences) levels.append(nest_level) abst_text = '\n'.join([sent['sentence'] for sent in sentences]) @@ -177,14 +191,31 @@ def process_entities(entities1, sentences1, params, dirpath): init_char = next_char spans.append((init_char, next_char)) + # doc_data = [] + # tags_ = [] + # terms_ = [] for xx, sentence in enumerate(sentences): + # offsets, words = calculate_offset(sentences, xx) offsets = sentence['offsets'] + # words = sentence['words'] + # chars = sentence['chars'] + # sent = sentence['sentence'] + # nner + # tags, terms_sentence = utils.assign_label(offsets, terms) tags, tags_terms, terms_sentence = assign_label(offsets, terms) + # tags_.append(tags) + # terms_.append(terms_sentence) + + # check sentence has no entity + # if len(terms_sentence) == 0: + # print('NO ENTITY: ', pmid, xx, sentence['sentence']) + + # tags_.extend([tag for level in tags for tag in level]) # for nested sentence['tags'] = tags sentence['terms'] = terms_sentence - + # nner sentence['tags_terms'] = tags_terms eids = [] @@ -196,17 +227,25 @@ def process_entities(entities1, sentences1, params, dirpath): for eid in eids: if eid in entities['data']: readable_ents[eid] = entities['data'][eid] + else: + readable_ents[eid] = triggers['data'][eid] + # sentence['readable_ents'] = readable_ents + # offsets2 span = spans[xx] - + # offs2 = [] + # etypes2 = [] for x, id_ in enumerate(eids): # for every entity if it belongs to sentence span ent = readable_ents[id_] b = int(ent['pos1']) e = int(ent['pos2']) + # b, e = offs[x] if (span[0] <= b <= span[1]) and (span[0] <= e <= span[1]): b2 = b - span[0] e2 = e - span[0] + # offs2.append([b2, e2]) # + # etypes2.append(ent['type']) ent['offs2'] = [b2, e2] else: print("SKIP ENTITY: " + str(b) + " --- " + str(e)) @@ -215,6 +254,7 @@ def process_entities(entities1, sentences1, params, dirpath): tokens = spliter( sentence['sentence']) # we have the tokens of the sentence and their corresponding offsets + tokensN = [tok for tok, b, e in tokens] for eid in 
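# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. process_tags
# above now builds one BIO tag inventory over entity AND trigger types; the
# toy type names are assumptions.
_types = ['Protein', 'Binding']            # merged entity + trigger types
_tags = [p + t for t in _types for p in ('B-', 'I-')]
_tags2types = {'O': 'O', **{tag: tag[2:] for tag in _tags}}
assert _tags == ['B-Protein', 'I-Protein', 'B-Binding', 'I-Binding']
# ---------------------------------------------------------------------------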
eids: if "offs2" not in readable_ents[eid]: @@ -250,6 +290,7 @@ def process_entities(entities1, sentences1, params, dirpath): pad_label = [['O'] * len(tags[0])] tags.extend(pad_label * pad_level) + # nner tags_terms = sentence['tags_terms'] pad_label = [['O'] * len(tags_terms[0])] tags_terms.extend(pad_label * pad_level) @@ -261,10 +302,31 @@ def process_entities(entities1, sentences1, params, dirpath): return input0 +def entity_tags(dico): + """ + Create a dictionary and a mapping of tags + """ + id_to_tag = {0: 'O'} + id_to_type = {0: 'O'} + # id_to_tag = {} + # id_to_type = {} + for i, (k, v) in enumerate(dico.items()): + # if v != 'O': + id_to_tag[2 * i + 1] = 'I-' + v + id_to_tag[2 * i + 2] = 'B-' + v + id_to_type[2 * i + 2] = v + + tag_to_id = {v: k for k, v in id_to_tag.items()} + type_to_id = {v: k for k, v in id_to_type.items()} + + return id_to_tag, tag_to_id, id_to_type, type_to_id + + def extract_entities(sw_sentence, tag2id_mapping, id2tag_mapping, nn_mapping): # For several edge cases max_depth = max(len(tags) for _, tags, _ in sw_sentence) + # for sentence in sentences: (not using loop) entities = defaultdict(list) terms = defaultdict(list) @@ -279,25 +341,27 @@ def extract_entities(sw_sentence, tag2id_mapping, id2tag_mapping, nn_mapping): try: tags = np.asarray( - [ - [tag2id_mapping[tag] for tag in tags + ["O"] * max_depth][ - :max_depth - ] - for _, tags, tags_terms in sw_sentence + [ + # bug: original + # [tag2id_mapping[tag] if tag in tag2id_mapping else tag2id_mapping["O"] for tag in tags + ["O"] * max_depth][ + [tag2id_mapping[tag] for tag in tags + ["O"] * max_depth][ + :max_depth ] + for _, tags, tags_terms in sw_sentence + ] ).T except KeyError as err: tags = np.asarray( - [ - [tag2id_mapping[tag] if tag in tag2id_mapping else tag2id_mapping["O"] for tag in - tags + ["O"] * max_depth][ - :max_depth - ] - for _, tags, tags_terms in sw_sentence + [ + [tag2id_mapping[tag] if tag in tag2id_mapping else tag2id_mapping["O"] for tag in tags + ["O"] * max_depth][ + :max_depth ] + for _, tags, tags_terms in sw_sentence + ] ).T print(err) + tags_terms = np.asarray( [ [tag_term for tag_term in tags_terms + ["O"] * max_depth][ @@ -393,3 +457,37 @@ def convert_to_sub_words(word_tokens, tags, tags_terms, tokenizer=None): sw_sentence.append([token] + [tags[token_idx], tags_terms[token_idx]]) subword_offset_mapping[token_idx] = token_idx return sw_sentence, subword_offset_mapping, subwords, valid_starts + +def convert_to_sub_words_lstm(word_tokens, tags, tags_terms, tokenizer=None): + subword_pos = 0 + subword_offset_mapping = {} + subwords = [] + sw_sentence = [] + + valid_starts = {0} + + for token_idx, token in enumerate(word_tokens): + if tokenizer: + # subtokens = tokenizer.tokenize(token) + subtokens = [token] + if subtokens: + sw_sentence.append(subtokens[:1] + [tags[token_idx], tags_terms[token_idx]]) + subword_offset_mapping[subword_pos] = token_idx + subword_pos += 1 + # subwords.append(subtokens[:1][0]) + subwords.append(subtokens[0]) + + labels = [re.sub("^B-", "I-", label) for label in tags[token_idx]] + ids = [re.sub("^B-", "I-", _id) for _id in tags_terms[token_idx]] + + for subtoken in subtokens[1:]: + sw_sentence.append([subtoken] + [labels, ids]) + subword_offset_mapping[subword_pos] = token_idx + subword_pos += 1 + subwords.append(subtoken) + + valid_starts.add(len(subwords)) + else: + sw_sentence.append([token] + [tags[token_idx], tags_terms[token_idx]]) + subword_offset_mapping[token_idx] = token_idx + return sw_sentence, subword_offset_mapping, subwords, 
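# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. When a word is
# split into subwords, only the first piece keeps its gold tag; continuation
# pieces have B- demoted to I- (the re.sub above), so no span can "restart"
# in the middle of a word. The toy tag is an assumption.
import re
_word_tags = ['B-Phosphorylation']               # tags of 'phosphorylation'
_cont_tags = [re.sub('^B-', 'I-', t) for t in _word_tags]
assert _cont_tags == ['I-Phosphorylation']       # for '##oryl', '##ation'
# ---------------------------------------------------------------------------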
valid_starts diff --git a/loader/prepData/event.py b/loader/prepData/event.py new file mode 100644 index 0000000..4790ed1 --- /dev/null +++ b/loader/prepData/event.py @@ -0,0 +1,270 @@ +"""Process events.""" + +import collections +from collections import OrderedDict + + +def count_nested_events(events): + for pmid, fevents in events.items(): + count_nested_evs_level(fevents['data']) + + +def count_nested_evs_level(fevents): + for evid, evdata in fevents.items(): + level = count_nested_ev_level(evdata, fevents, current_level=0) + evdata['nested_ev_level'] = level + + +def count_nested_ev_level(evdata, fevents, current_level): + """Nested event level""" + + # to avoid loop forever + if current_level > 20: + return current_level + + # flat + if evdata['is_flat_ev']: + return current_level + + # nested + elif 'nested_ev_level' in evdata: + return current_level + evdata['nested_ev_level'] + + else: + levels = [] + args_ids = evdata['args_ids'] + for arg_id in args_ids: + if arg_id.startswith('E'): + arg_evdata = fevents[arg_id] + if arg_evdata['is_flat_ev']: + levels.append(current_level + 1) + else: + arg_level = count_nested_ev_level(arg_evdata, fevents, current_level + 1) + levels.append(arg_level) + + level = max(levels) + return level + + +def extract_events(events0, entities1): + """Extract event data""" + + nflat = 0 + n1nested = 0 + nevents = 0 + + events1 = OrderedDict() + for pmid in events0: + events = events0[pmid]['data'] + idsE = events0[pmid]['ids'] + entities = entities1['pmids'][pmid]['data'] + ev2_ = OrderedDict() + + nevents += len(idsE) + + # Read event data + for idE in events: + event = events[idE] + args_data = event['args_data'] + + nestedE = OrderedDict() + if event['is_nested_ev']: + n1nested += 1 + for idnE in event['nested_events']: + nE = events[idnE] + nestedE[idnE] = nE + else: + nflat += 1 + + event['nested_events_info'] = nestedE + + argTypes = [] + argEntities = [] + if event['args_num'] > 0: + for arg in args_data: + typeR = arg[0] + typeR = ''.join([i for i in typeR if not i.isdigit()]) + eid = arg[1] + if eid in entities: + typeT = entities[eid]['type'] + typeArg = typeR + '->' + typeT + eArg = typeR + '->' + eid + else: + typeT = 'E' + typeArg = (typeR, typeT) + eArg = (typeR, eid) + argTypes.append(typeArg) + argEntities.append(eArg) + + event['args_types'] = argTypes + event['args_entities'] = argEntities + ev2_[idE] = event + + # Process nested events + for idE in ev2_: + event = ev2_[idE] + nestedE2 = False + if event['is_nested_ev']: + argsTypes = event['args_types'] + argsTypes2 = [] + for xx, arg in enumerate(event['args_data']): + typeR = arg[0] + eid = arg[1] + typeArg = argsTypes[xx] + if eid not in entities: + nEvent = events[eid] + if nEvent['is_nested_ev']: + typeArg = (typeArg[0], 'nestedEV') + nestedE2 = True + else: + if nEvent['is_zeroArg']: + typenEvent = ('Nested1', nEvent['trtype'], ['None']) + else: + typenEvent = ('Nested1', nEvent['trtype'], nEvent['args_types']) + + typeArg = (typeR, typenEvent) + + argsTypes2.append(typeArg) + + event['args_types'] = argsTypes2 + event['is_nested_ev_level2'] = nestedE2 + ev2_[idE] = event + + events1[pmid] = ev2_ + + evNums = OrderedDict() + evNums['ev_num'] = nevents + evNums['ev_flat'] = nflat + evNums['nested_level1'] = n1nested + + events3 = OrderedDict() + events3['pmids'] = events1 + events3['evNum'] = evNums + + return events3 + + +def string2pair(st): + """Parse line to event structure""" + + pairs = [] + + pairs0 = st.split('+') + for pair in pairs0: + if '0' in pair: + pair0 = 
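# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. The recursion in
# count_nested_ev_level above bottoms out at flat events and otherwise takes
# the deepest event argument, with the same depth guard against cycles; the
# toy event table is an assumption.
_events = {
    'E1': {'args_ids': ['T1'], 'is_flat_ev': True},
    'E2': {'args_ids': ['E1'], 'is_flat_ev': False},
    'E3': {'args_ids': ['E2', 'T2'], 'is_flat_ev': False},
}
def _level(ev_id, depth=0):
    if _events[ev_id]['is_flat_ev'] or depth > 20:
        return depth
    return max(_level(a, depth + 1)
               for a in _events[ev_id]['args_ids'] if a.startswith('E'))
assert _level('E1') == 0 and _level('E3') == 2
# ---------------------------------------------------------------------------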
pair.split('0') + pairs.append(pair0) + elif '1' in pair: + pair0 = pair.split('1') + pairs.append(pair0) + elif '2' in pair: + pair0 = pair.split('2') + pairs.append(pair0) + elif '3' in pair: + pair0 = pair.split('3') + pairs.append(pair0) + + return pairs + + +def count_structures(structs0): + """Event structure""" + + for typeTR, structs in structs0.items(): + + # store structures by each trigger type + structs_counts = dict(collections.Counter(structs)) + structs_data = OrderedDict() + + for struct, count in structs_counts.items(): + pairs = string2pair(struct) + # structs_data[struct] = [pairs, count] + structs_data[struct] = [pairs] + + # store structure data + structs0[typeTR] = structs_data + + return structs0 + + +def extract_trigger_structures(events1, entities1): + """Event structure by trigger type""" + + structs0 = collections.defaultdict(list) + structs1 = collections.defaultdict(list) + + n_events = 0 + n_1events = 0 + + for pmid in events1['pmids']: + events = events1['pmids'][pmid] + entities = entities1['pmids'][pmid]['data'] + + for idE in events: + event = events[idE] + trtype = event['trtype'] + args_data = event['args_data'] + + n_events += 1 + + # nested event + if event['is_nested_ev']: + n_1events += 1 + trtype = event['trtype'] + args_data = event['args_data'] + + args_type = '' + for pair in args_data: + if len(pair) > 0: + typeR = pair[0] + + # event argument + A2 = pair[1] + + # argument is entity: flat + if A2 in entities: + typeA2 = entities[A2]['type'] + type1 = typeR + '0' + typeA2 + + # argument is event: nested + else: + typeA2 = events[A2]['trtype'] + type1 = typeR + '1' + typeA2 + + else: + type1 = 'None' + '0' + trtype + if len(args_type) > 0: + args_type += '+' + args_type += type1 + event['args_type'] = args_type + + structs1[trtype].append(args_type) + + # flat event + else: + args_type = '' + for pair in args_data: + if len(pair) > 0: + typeR = pair[0] + if pair[1] not in entities: + print(pmid, pair[1]) + continue + typeT = entities[pair[1]]['type'] + type1 = typeR + '0' + typeT + else: + type1 = 'None' + '0' + trtype + if len(args_type) > 0: + args_type += '+' + args_type += type1 + + event['args_type'] = args_type + + structs0[trtype].append(args_type) + + structs0 = count_structures(structs0) + structs1 = count_structures(structs1) + + print('events: ', n_events, ' flat events: ', (n_events - n_1events)) + print('nested: ', n_1events) + + return {'structs0': structs0, 'structs1': structs1}, events1 diff --git a/loader/prepData/prepdata.py b/loader/prepData/prepdata.py index 377a42f..cb5a6f0 100644 --- a/loader/prepData/prepdata.py +++ b/loader/prepData/prepdata.py @@ -1,27 +1,43 @@ -"""Load data from brat format and process for entity""" - -from collections import OrderedDict +"""Load data from brat format and process for entity, trigger, relation, events.""" from loader.prepData.brat import brat_loader from loader.prepData.sentence import prep_sentence_offsets, process_input from loader.prepData.entity import process_etypes, process_tags, process_entities +from loader.prepData.event import extract_events, count_nested_events, extract_trigger_structures def prep_input_data(files_fold, params): # load data from *.ann files - entities0, sentences0 = brat_loader(files_fold, params) + triggers0, entities0, relations0, events0, sentences0 = brat_loader(files_fold, params) # sentence offsets sentences1 = prep_sentence_offsets(sentences0) + if 'pipeline_text_data' in params: + sent_words = [] + for pmid in sentences0: + doc_data = 
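# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. A structure
# string such as 'Theme0Protein+Cause1Regulation' (assumed example) encodes
# (role, target-type) pairs joined by '+', where the digit separator marks
# the argument kind (0 = entity, 1 = event); this is what string2pair above
# recovers.
_st = 'Theme0Protein+Cause1Regulation'
_pairs = []
for _chunk in _st.split('+'):
    for _sep in '0123':
        if _sep in _chunk:
            _pairs.append(_chunk.split(_sep))
            break
assert _pairs == [['Theme', 'Protein'], ['Cause', 'Regulation']]
# ---------------------------------------------------------------------------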
params['pipeline_text_data'][pmid] + for sent, pipe_sent in zip(sentences1['doc_data'][pmid], doc_data): + sent['words'] = pipe_sent['words'] + sent_words.append(sent['words']) + sent['offsets'] = pipe_sent['offsets'] + sentences1['sent_words'] = sent_words # entity entities1 = process_etypes(entities0) # all entity types - terms0 = process_tags(entities1) # terms, offset, tags, etypes - input0 = process_entities(entities1, sentences1, params, files_fold) + triggers1 = process_etypes(triggers0) # all trigger types + terms0 = process_tags(entities1, triggers1) # terms, offset, tags, etypes + input0 = process_entities(entities1, triggers1, sentences1, params, files_fold) + + # event + count_nested_events(events0) + events1 = extract_events(events0, entities1) + structsTR, events2 = extract_trigger_structures(events1, entities1) # prepare for training batch data for each sentence - input1 = process_input(input0) + input1 = process_input(input0, entities0, relations0, events2, params, files_fold) + # + print("Missing gold entities:") for doc_name, doc in sorted(input0.items(), key=lambda x: x[0]): entities = set() num_entities_per_doc = 0 @@ -35,18 +51,5 @@ def prep_input_data(files_fold, params): if diff: print(doc_name, sorted(diff, key=lambda _id: int(_id.replace("T", "")))) - # entity indices - g_entity_ids_ = OrderedDict() - for fid, fdata in entities0.items(): - # get max entity id - eid_ = [eid for eid in fdata['ids'] if not eid.startswith('TR')] - ids_ = [int(eid.replace('T', '')) for eid in eid_] - if len(ids_) > 0: - max_id = max(ids_) - else: - max_id = 0 - eid_.append(max_id) - g_entity_ids_[fid] = eid_ - - return {'entities': entities1, 'terms': terms0, 'sentences': sentences1, 'input': input1, - 'g_entity_ids_': g_entity_ids_} + return {'entities': entities1, 'triggers': triggers1, 'terms': terms0, 'relations': relations0, 'events': events0, + 'sentences': sentences1, 'input': input1, 'structsTR': structsTR} diff --git a/loader/prepData/relation.py b/loader/prepData/relation.py new file mode 100644 index 0000000..12c2603 --- /dev/null +++ b/loader/prepData/relation.py @@ -0,0 +1,118 @@ +"""Process relation information.""" + +from collections import OrderedDict + + +def process_relations(readable_entsA, readable_entsB, readable_ents, true_relations, unk, params): + r_idxs = OrderedDict() + readable_rels = OrderedDict() + + for e1, ent1 in enumerate(readable_entsA): # ent1 is A + if ent1 not in r_idxs: + r_idxs[ent1] = list(readable_ents.keys()).index( + ent1) # find to which index corresponds from all entities + for e2, ent2 in enumerate(readable_entsB): + if ent2 not in r_idxs: # ent2 is B + r_idxs[ent2] = list(readable_ents.keys()).index(ent2) + + if (ent1, ent2) not in readable_rels: + readable_rels[(ent1, ent2)] = [] + if (ent2, ent1) not in readable_rels: + readable_rels[(ent2, ent1)] = [] + + # A before B (in text) + Apos = readable_ents[ent1]['pos2'] + Bpos = readable_ents[ent2]['pos1'] + + # if readable_ents[ent1][4][-1] <= readable_ents[ent2][4][0]: + if Apos <= Bpos: + pref_f = '' + pref_b = '_INV' + arg1 = ent1 + arg2 = ent2 + # B before A (in text) + else: + pref_f = '_INV' + pref_b = '' + arg1 = ent2 + arg2 = ent1 + + Fpair = [('Arg1', arg1), ('Arg2', arg2)] # forward + Rpair = [('Arg1', arg2), ('Arg2', arg1)] # reverse + + total_rels = len(true_relations) + not_found = 0 + for rel in true_relations: # existing relations + + if rel[1] == 'Other': # in case negative relations are already labeled + # left-to-right + readable_rels[(arg1, arg2)] = (rel[0] + pref_f, 
'1:Other:2') + # right-to-left + if params['direction'] != 'l2r': + readable_rels[(arg2, arg1)] = (rel[0] + pref_b, '1:Other:2') + + # AB existing relation + if Fpair == true_relations[rel]: + # left-to-right + if len(readable_rels[(arg1, arg2)]) == 0: + readable_rels[(arg1, arg2)] = (rel[0] + pref_f, '1:' + rel[1] + ':2') + # right-to-left + if params['direction'] == 'neg': + readable_rels[(arg2, arg1)] = (rel[0] + pref_b, '1:Other:2') + elif params['direction'] == 'l2r+r2l': + if len(readable_rels[(arg2, arg1)]) == 0: + readable_rels[(arg2, arg1)] = (rel[0] + pref_b, '2:' + rel[1] + ':1') + # BA existing relation + elif Rpair == true_relations[rel]: + # left-to-right + if len(readable_rels[(arg1, arg2)]) == 0: + readable_rels[(arg1, arg2)] = (rel[0] + pref_f, '2:' + rel[1] + ':1') + # right-to-left + if params['direction'] == 'neg': + readable_rels[(arg2, arg1)] = (rel[0] + pref_b, '1:Other:2') + elif params['direction'] == 'l2r+r2l': + if len(readable_rels[(arg2, arg1)]) == 0: + readable_rels[(arg2, arg1)] = (rel[0] + pref_b, '1:' + rel[1] + ':2') + else: + not_found += 1 + + # this pair does not have a relation + if not_found == total_rels: + if readable_rels[(arg1, arg2)] or readable_rels[ + (arg2, arg1)]: # if pair already there, don't do anything + continue + + rel_new_id = 'R-' + str(unk) + + # left-to-right + readable_rels[(arg1, arg2)] = (rel_new_id + pref_f, '1:Other:2') + # right-to-left + if params['direction'] != 'l2r' and (ent1 != ent2): + readable_rels[(arg2, arg1)] = (rel_new_id + pref_b, '1:Other:2') + unk += 1 + + return r_idxs, readable_rels + + +def get_rtypes(data_struct, data_struct_dev): + rel_len = [] + rels = [] + for sid in data_struct['input']: + sent = data_struct['input'][sid] + rels2 = [] + for (e1, e2) in sent['readable_r']: + if sent['readable_r'][(e1, e2)]: + rels2.append(sent['readable_r'][(e1, e2)][1]) + rels.append(rels2) + rel_len.append(len(rels2)) + + for sid in data_struct_dev['input']: + sent = data_struct_dev['input'][sid] + rels2 = [] + for (e1, e2) in sent['readable_r']: + if sent['readable_r'][(e1, e2)]: + rels2.append(sent['readable_r'][(e1, e2)][1]) + rels.append(rels2) + rel_len.append(len(rels2)) + + return rels diff --git a/loader/prepData/sentence.py b/loader/prepData/sentence.py index ee59a2f..b177e94 100644 --- a/loader/prepData/sentence.py +++ b/loader/prepData/sentence.py @@ -4,6 +4,8 @@ from collections import OrderedDict import numpy as np +from loader.prepData.relation import process_relations + def calculate_offset(sentences, i): """ @@ -33,6 +35,7 @@ def prep_sentence_offsets(sentences0): sentences_ = [] sent_words = [] words_ = [] + # chars_ = [] sentences1 = OrderedDict() sent_lens = [] for pmid in sentences0: @@ -42,14 +45,18 @@ def prep_sentence_offsets(sentences0): doc_data = [] for xx, sentence in enumerate(sentences): offsets, words = calculate_offset(sentences, xx) + # chars = ["".join([w for w in words])] + # chars2 = [[c for c in w] for w in words] sent_lens.append(len(words)) sent_words.append(words) words_.extend(words) + # chars_.extend(chars) doc_data.append({ 'sentence': sentence, 'words': words, + # 'chars': chars2, 'offsets': offsets }) @@ -62,42 +69,130 @@ def prep_sentence_offsets(sentences0): sentences2['sentences'] = sentences_ sentences2['sent_words'] = sent_words sentences2['words'] = words_ + # sentences2['chars'] = chars_ sentences2['max_sent_len'] = max_sent_len return sentences2 -def process_input(input0): +def process_input(input0, entities0, relations0, events2, params, dirpath): + emissed = 0 + for 
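# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. Candidate pairs
# above are labelled in both directions: surface order in the text fixes the
# 'forward' pair, and the reverse copy gets an _INV suffix on the relation
# id. The offsets and ids here are assumptions.
_a_end, _b_start = 120, 145                # ent1 ends before ent2 starts
_pref_f, _pref_b = ('', '_INV') if _a_end <= _b_start else ('_INV', '')
_readable_rels = {
    ('T1', 'T2'): ('R5' + _pref_f, '1:Theme:2'),   # left-to-right
    ('T2', 'T1'): ('R5' + _pref_b, '2:Theme:1'),   # right-to-left
}
# ---------------------------------------------------------------------------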
pmid in input0: sentences_data = input0[pmid] + relations_data = relations0[pmid]['data'] + events_data = events2['pmids'][pmid] + # events2_data = events2[pmid] + + # selected = [] + # abst_sents_rels = [] + unk = 0 + added_events = [] for sid, sentence in enumerate(sentences_data): eids = sentence['eids'] readable_ents = sentence['readable_ents'] + cand_pairs = OrderedDict() + for idR in relations_data: + + relation = relations_data[idR] + rol1 = 'Arg1' + rol2 = 'Arg2' + arg1 = relation['arg1id'] + arg2 = relation['arg2id'] + typeR = relation['type'] + idR = relation['id'] + p = (idR, typeR) + pair = [(rol1, arg1), (rol2, arg2)] + + if arg1 in eids and arg2 in eids: + # selected.append(p) + cand_pairs[p] = pair + + sentence['rels'] = cand_pairs + true_relations = cand_pairs + + # RELATIONS readable_entsA = OrderedDict() read_temp = OrderedDict() for ee1 in eids: if ee1.startswith('TR'): - readable_entsA[ee1] = readable_ents[ee1] + readable_entsA[ee1] = readable_ents[ee1] # triggers else: - read_temp[ee1] = readable_ents[ee1] - readable_entsB = OrderedDict() + read_temp[ee1] = readable_ents[ee1] # entities + readable_entsB = OrderedDict() # augment with triggers for trig-trig pairs readable_entsB.update(read_temp) readable_entsB.update(readable_entsA) - r_idxs = OrderedDict() + r_idxs, readable_rels = process_relations(readable_entsA, readable_entsB, readable_ents, true_relations, + unk, + params) + sentence['readable_r'] = readable_rels sentence['idx'] = r_idxs sent_evs = OrderedDict() + for idE in events_data: + event = events_data[idE] + idTR = event['trid'] + if event['args_num'] == 0: + if idTR in sentence['idx']: + event['rel'] = {} + sent_evs[idE] = event + else: + + args_data = event['args_data'] + isEvent = True + rels = OrderedDict() + for xx, arg1 in enumerate(args_data): + typeR = arg1[0] + idArg = arg1[1] + if idArg in events_data and idTR in sentence['idx']: + # argument is event + argEv = events_data[idArg] + idArg2 = argEv['trid'] + if (idTR, idArg2) in readable_rels: + rel_data = readable_rels[(idTR, idArg2)] + if typeR in rel_data[1]: + rels[(idTR, idArg2)] = [rel_data[0], typeR] + continue + else: + isEvent = False + break + else: + isEvent = False + break + + elif (idTR, idArg) in readable_rels: + rel_data = readable_rels[(idTR, idArg)] + if typeR in rel_data[1]: + rels[(idTR, idArg)] = [rel_data[0], typeR] + continue + else: + isEvent = False + break + else: + isEvent = False + break + if isEvent: + event['rel'] = rels + sent_evs[idE] = event sentence['readable_ev'] = sent_evs trigger_ev = collections.defaultdict(list) + # idEvs = OrderedDict() + for idE in sent_evs: + event = sent_evs[idE] + idTR = event['trid'] + trigger_ev[idTR].append(event) sentence['trigger_ev'] = trigger_ev + # sentence['idEvs'] = idEvs + + added_events.extend([idE for idE in sent_evs]) input1 = OrderedDict() for pmid in input0: diff --git a/loader/prepNN/ent2net.py b/loader/prepNN/ent2net.py index b6bb403..0c44bcf 100644 --- a/loader/prepNN/ent2net.py +++ b/loader/prepNN/ent2net.py @@ -2,17 +2,8 @@ import collections -from loader.prepData.entity import extract_entities, convert_to_sub_words - - -def _elem2idx(list_of_elems, map_func): - """ - :param list_of_elems: list of lists - :param map_func: mapping dictionary - :returns - list with indexed elements - """ - return [[map_func[x] for x in list_of] for list_of in list_of_elems] +from loader.prepData.entity import extract_entities, convert_to_sub_words, convert_to_sub_words_lstm +from loader.prepNN.mapping import _elem2idx def 
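# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. process_input
# above keeps a gold relation as a sentence-level candidate only when both
# argument ids occur among that sentence's entity/trigger ids; the toy ids
# are assumptions.
_eids = {'TR7', 'T4'}
_relations = {'R1': ('Theme', 'TR7', 'T4'), 'R2': ('Theme', 'TR7', 'T9')}
_cand_pairs = {rid: rel for rid, rel in _relations.items()
               if rel[1] in _eids and rel[2] in _eids}
assert list(_cand_pairs) == ['R1']         # R2 crosses a sentence boundary
# ---------------------------------------------------------------------------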
entity2network(sentence_data, words, params, tokenizer): @@ -20,7 +11,21 @@ def entity2network(sentence_data, words, params, tokenizer): tags = sentence_data['tags'] tags_terms = sentence_data['tags_terms'] - sw_sentence, sub_to_word, subwords, valid_starts = convert_to_sub_words(words, + # nner: Using subwords: + if params['predict'] and params['pipelines']: + if params['pipe_flag'] > 0: + tokenizer = None + + # if use lstm + if params['use_lstm']: + sw_sentence, sub_to_word, subwords, valid_starts = convert_to_sub_words_lstm(words, + list(map(list, zip(*tags))), + list(map(list, zip(*tags_terms))), + tokenizer=tokenizer) + + # or bert + else: + sw_sentence, sub_to_word, subwords, valid_starts = convert_to_sub_words(words, list(map(list, zip(*tags))), list(map(list, zip(*tags_terms))), tokenizer=tokenizer) @@ -34,15 +39,21 @@ def entity2network(sentence_data, words, params, tokenizer): tagsIDs = list(map(list, zip(*tagsIDs))) tagsT = [] + tagsTR = [] for tag in tagsIDs: - tagsT.append(tag) + if tag[0] in params['trTags_Ids']: + tagsTR.append(tag) + else: + tagsT.append(tag) readable_e = sentence_data['readable_ents'] idxs = sentence_data['idx'] rev_idxs = {id: ent for ent, id in idxs.items()} toks2 = [] etypes2 = [] + # ents = OrderedDict() ents = collections.defaultdict(list) + # dup_ent_tag = False for xx in range(0, len(idxs)): ent = rev_idxs[xx] @@ -57,6 +68,8 @@ def entity2network(sentence_data, words, params, tokenizer): toksid = toks[0] ents[toksid].append([ent, readable_e[ent]['offs2'], readable_e[ent]['text']]) + # fix bug for mlee + # etypes2ids = [params['mappings']['type_map'][etype] if etype in params['mappings']['type_map'] else params['mappings']['type_map']['Metabolism'] for etype in etypes2] etypes2ids = [params['mappings']['type_map'][etype] for etype in etypes2] - return readable_e, idxs, ents, toks2, etypes2ids, entities, sw_sentence, sub_to_word, subwords, valid_starts, tagsIDs, terms + return readable_e, idxs, ents, toks2, etypes2ids, entities, sw_sentence, sub_to_word, subwords, valid_starts, tagsIDs, tagsTR, terms diff --git a/loader/prepNN/ev2net.py b/loader/prepNN/ev2net.py new file mode 100644 index 0000000..10cfd12 --- /dev/null +++ b/loader/prepNN/ev2net.py @@ -0,0 +1,306 @@ +"""Prepare event data for training networks.""" + +import collections +import numpy as np +import logging + +logger = logging.getLogger(__name__) + + +def create_ev_truth(idxs, readable_e, events, params): + ev_num = 0 + ev_matched = 0 + max_ev_per_layer = 0 + + truth_ev = -1 * np.ones((len(readable_e), 2), dtype=np.object) + + truth_ev0 = collections.defaultdict(list) + type_debug = False + + for idTR in events: + xxTR = idxs[idTR] + + ev_l_0 = 0 + ev_l_1 = 0 + + for event in events[idTR]: + event['modality'] = params['mappings']['modality_map'][event['modality']] + ev_num += 1 + + typeTR = event['trtype'] + + struct = event['args_type'] + + if typeTR in params['mappings']['flat_structs_map']: + if struct in params['mappings']['flat_structs_map'][typeTR]: + ev_matched += 1 + ev_l_0 += 1 + ev_argtype = event['args_type'] + args_ids = params['mappings']['flat_structs_map'][typeTR][ev_argtype] + + rels = event['rel'] + a2ids = [] + for rel, reltype in rels.items(): + a2id = idxs[rel[1]] + a2ids.append(a2id) + + # self event + + if event['is_zeroArg']: + a2ids.append(xxTR) + + truth_ev0[(xxTR, 0)].append([args_ids, a2ids]) + + if typeTR in params['mappings']['nested_structs_map']: + if struct in params['mappings']['nested_structs_map'][typeTR]: + ev_matched += 1 + ev_l_1 += 1 + ev_argtype = 
event['args_type'] + args_ids = params['mappings']['nested_structs_map'][typeTR][ev_argtype] + + rels = event['rel'] + a2ids = [] + for rel, reltype in rels.items(): + a2id = idxs[rel[1]] + a2ids.append(a2id) + + truth_ev0[(xxTR, 1)].append([args_ids, a2ids]) + + max_ev_per_layer = max(ev_l_0, ev_l_1, max_ev_per_layer) + + for trid, pairs in truth_ev0.items(): + truth_ev[trid[0]][trid[1]] = pairs + + no_event = False + if len(truth_ev0) == 0: + no_event = True + + ev_missed = ev_num - ev_matched + + return truth_ev, ev_num, ev_matched, ev_missed, no_event, type_debug, max_ev_per_layer + + +def event2network(sentence_data, fid, idxs, events_map, max_ev_per_layer, readable_e, params): + # input + events = sentence_data['trigger_ev'] + + # create labels for events + truth_ev, ev_num, ev_matched, ev_missed, no_event, type_debug, max_ev_per_layer_ = create_ev_truth(idxs, + readable_e, + events, params) + + # C2T add + max_ev_per_layer = max(max_ev_per_layer_, max_ev_per_layer) + + # ev_num2 += ev_num + # ev_matched2 += ev_matched + # ev_missed2 += ev_missed + + # Add events to map: + for _, events_list in enumerate(events.items()): + for event in events_list[1]: + if fid not in events_map: + events_map[fid] = {event['id']: event} + else: + events_map[fid][event['id']] = event + + return events, truth_ev, max_ev_per_layer + + +def count_ev_truth(samples): + """Count the number of created truth events.""" + + # count total number of valid truth events + total_count_valid_evs = 0 + + # for each sentence + for sample in samples: + # get truth + truth_ev = sample['truth_ev'] + + # count the valid event truth + valid_truth_ev = truth_ev[truth_ev != -1] + count_valid_ev = sum([len(truth_list) for truth_list in valid_truth_ev]) + total_count_valid_evs += count_valid_ev + + print('Check created event truth') + print('Valid truth events: ', total_count_valid_evs) + + return + + +def gen_nn_truth_nested_ev(fid, typeTR, struct, mapping_structs, event, span_terms, ev_idx, events_map, params, + self_event=False): + try: + if typeTR in mapping_structs: + if struct in mapping_structs[typeTR]: + ev_argtype = event['args_type'] + args_ids = mapping_structs[typeTR][ev_argtype] + rels = event['rel'] + + # store entity arguments and event arguments + ent_args = [] + ev_args = [] + + if len(event['nested_events']) > 0: + nested_evs = [events_map[fid][eid] if eid in events_map[fid] else -1 for eid in + event['nested_events']] + nested_trIds = [ev['trid'] if ev != -1 else -1 for ev in nested_evs] + for rel, reltype in rels.items(): + argid = rel[1] + + # is event argument + if len(event['nested_events']) > 0: + + # is trigger + if argid in nested_trIds: + nested_ev = nested_evs[nested_trIds.index(argid)] + a2id = gen_nn_truth_nested_evs(fid, nested_ev, span_terms, events_map, params) + ev_args.append(a2id) + + # is entity + else: + a2id = span_terms.term2id[argid] + ent_args.append(a2id) + + # or flat + else: + a2id = span_terms.term2id[argid] + ent_args.append(a2id) + + # self event + if self_event: + if event['is_zeroArg']: + ent_args.append(ev_idx) + + if len(ent_args) > 0: + ent_args = collections.Counter(ent_args) + + nested_ev_level = event['nested_ev_level'] + truth_out = (nested_ev_level, args_ids, ent_args, ev_args) + return truth_out + + except (KeyError, ValueError) as err: + logger.debug(err) + return None + + +def gen_nn_truth_nested_evs(fid, nested_ev, span_terms, events_map, params): + nested_idTR = nested_ev['trid'] + nested_ev_idx = span_terms.term2id[nested_idTR] + typeTR = nested_ev['trtype'] + 
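# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. truth_ev above
# is a ragged structure held in a NumPy object array: one row per trigger
# candidate, column 0 for flat-event truths, column 1 for nested ones, and
# -1 wherever no gold event exists. The toy contents are assumptions.
import numpy as np
_truth_ev = -1 * np.ones((3, 2), dtype=object)
_truth_ev[0][0] = [[[7], [2, 5]]]          # [structure ids, argument ids]
# ---------------------------------------------------------------------------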
struct = nested_ev['args_type'] + nested_ev_present = gen_nn_truth_nested_ev(fid, typeTR, struct, params['mappings']['flat_structs_map'], nested_ev, + span_terms, nested_ev_idx, events_map, params, self_event=True) + if not nested_ev_present: + nested_ev_present = gen_nn_truth_nested_ev(fid, typeTR, struct, params['mappings']['nested_structs_map'], + nested_ev, + span_terms, nested_ev_idx, events_map, params) + + return nested_ev_present + + +def gen_nn_truth_ev(fid, truth_ev_layer, typeTR, struct, mapping_structs, event, span_terms, ev_idx, events_map, params, + self_event=False): + try: + if typeTR in mapping_structs: + if struct in mapping_structs[typeTR]: + ev_argtype = event['args_type'] + struct_ids = mapping_structs[typeTR][ev_argtype] + rels = event['rel'] + + # store entity and event arguments + ent_args_list = [] + ev_args_list = [] + + if len(event['nested_events']) > 0: + nested_evs = [events_map[fid][eid] if eid in events_map[fid] else -1 for eid in + event['nested_events']] + nested_trIds = [ev['trid'] if ev != -1 else -1 for ev in nested_evs] + for rel, reltype in rels.items(): + argid = rel[1] + if len(event['nested_events']) > 0: + + # is trigger + if argid in nested_trIds: + nested_ev = nested_evs[nested_trIds.index(argid)] + a2id = gen_nn_truth_nested_evs(fid, nested_ev, span_terms, events_map, params) + ev_args_list.append(a2id) + + # is entity + else: + a2id = span_terms.term2id[argid] + ent_args_list.append(a2id) + + # is flat + else: + a2id = span_terms.term2id[argid] + ent_args_list.append(a2id) + + # self event + if self_event: + if event['is_zeroArg']: + ent_args_list.append(ev_idx) + + mod_label = event['modality'] + if len(ent_args_list) > 0: + ent_args_list = collections.Counter(ent_args_list) + truth_out = [(struct_ids, ent_args_list, ev_args_list), mod_label] + truth_ev_layer.append(truth_out) + + except (KeyError, ValueError) as err: + logger.debug(err) + + +def gen_nn_truth_evs(fid, span_terms, events, events_map, params): + truth_ev = -1 * np.ones((len(events), params['max_ev_level'] + 1, params['max_ev_args'] + 1), dtype=np.object) + ev_lbls = -1 * np.ones((len(events)), dtype=np.object) + ev_idxs = {} + + truth_ev_dict = collections.defaultdict(list) + ev_lbls_dict = collections.defaultdict(list) + + # store list of events for each trigger id + ev_idxs_lst = [] + + for idTR in events: + if idTR in span_terms.term2id: + + # event trigger index + ev_trid = span_terms.term2id[idTR] + + ev_idxs_lst.append(ev_trid) + for i, event in enumerate(events[idTR]): + mod_label = event['modality'] + typeTR = event['trtype'] + struct = event['args_type'] + + # get the number of arguments, and nested level + arg_num = event['args_num'] + nested_ev_level = event['nested_ev_level'] + + # flat events + gen_nn_truth_ev(fid, truth_ev_dict[(ev_trid, 0, arg_num)], typeTR, struct, + params['mappings']['flat_structs_map'], event, + span_terms, ev_trid, events_map, params, self_event=True) + + # nested events + gen_nn_truth_ev(fid, truth_ev_dict[(ev_trid, nested_ev_level, arg_num)], typeTR, struct, + params['mappings']['nested_structs_map'], event, + span_terms, ev_trid, events_map, params) + ev_lbls_dict[ev_trid].append(mod_label) + + for i, ev_trid in enumerate(ev_idxs_lst): + ev_idxs[ev_trid] = i + ev_lbls[i] = ev_lbls_dict[ev_trid] + for level in range(params['max_ev_level'] + 1): + for narg in range(params['max_ev_args'] + 1): + try: + if len(truth_ev_dict[(ev_trid, level, narg)]) > 0: + truth_ev[i][level][narg] = truth_ev_dict[(ev_trid, level, narg)] + else: + 
truth_ev[i][level][narg] = -1 + except KeyError: + truth_ev[i][level][narg] = -1 + + return truth_ev, ev_idxs, ev_lbls diff --git a/loader/prepNN/mapping.py b/loader/prepNN/mapping.py new file mode 100644 index 0000000..0d32e3b --- /dev/null +++ b/loader/prepNN/mapping.py @@ -0,0 +1,252 @@ +"""Generate mappings""" + +import itertools +from collections import OrderedDict +from collections import Counter +import numpy as np + +from loader.prepData.entity import entity_tags +from loader.prepData.relation import get_rtypes +from loader.prepNN.structure import process_structure + + +def _generate_mapping(list_of_elems): + """ + :param list_of_elems: list of elements (single or nested) + :returns + dictionary with a unique id for each element + """ + # list of lists + elem_count = OrderedDict() + if all(isinstance(el, list) for el in list_of_elems): + for item in itertools.chain.from_iterable(list_of_elems): + if item not in elem_count: + elem_count[item] = 1 + else: + elem_count[item] += 1 + # single lists + else: + for item in list_of_elems: + if item not in elem_count: + elem_count[item] = 1 + else: + elem_count[item] += 1 + elem_count = sorted(elem_count.items(), key=lambda x: x[1]) # sort from low to high freq + mapping = OrderedDict([(elem, i) for i, (elem, val) in enumerate(elem_count)]) + rev_mapping = OrderedDict([(v, k) for k, v in mapping.items()]) + return mapping, rev_mapping, len(elem_count) + + +def _find_singletons(list_of_elems, args, min_w_freq): + """ + :param list_of_elems: list of all words in a train dataset + :returns + number of words with frequency = 1 + """ + elem_count = Counter([x for x in list_of_elems]) + unique_args = list(set(itertools.chain.from_iterable([a.split(' ') for a in args]))) + singles = [elem for elem, val in elem_count.items() if ((val <= min_w_freq) and (elem not in unique_args))] + return singles + + +def generate_map(data_struct, data_struct_dev, data_struct_test, params): # add test for mlee + + # 1. words mapping + words = data_struct['sentences']['sent_words'] + words_train = data_struct['sentences']['words'] + words.append(['']) + word_map, rev_word_map, word_size = _generate_mapping(words) + + # 2. .. 
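# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. _generate_mapping
# above sorts items from low to high frequency before assigning ids, so the
# most frequent item receives the highest id; the toy vocabulary is an
# assumption.
import itertools
from collections import Counter, OrderedDict
_sents = [['binding', 'of', 'protein'], ['binding', 'site']]
_counts = Counter(itertools.chain.from_iterable(_sents))
_ranked = sorted(_counts.items(), key=lambda kv: kv[1])    # low -> high freq
_word_map = OrderedDict((w, i) for i, (w, _) in enumerate(_ranked))
assert _word_map['binding'] == len(_word_map) - 1
# ---------------------------------------------------------------------------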
+ # labels of entity (in .a1) + argumentsT = data_struct['entities']['arguments'] + + # labels of trigger (in .a2) + argumentsTR = data_struct['triggers']['arguments'] + arguments = argumentsT + argumentsTR + singlesW = _find_singletons(words_train, arguments, params['min_w_freq']) + + typesTR = data_struct['terms']['typesTR'] + typesTR.extend(data_struct_dev['terms']['typesTR']) + + typesT = data_struct['terms']['typesT'] + typesT.extend(data_struct_dev['terms']['typesT']) + + # add for test: fig bug for mlee + typesTR.extend(data_struct_test['terms']['typesTR']) + typesT.extend(data_struct_test['terms']['typesT']) + + all_types = [] + for type in typesTR: + if type not in all_types: + all_types.append(type) + + for type in typesT: + if type not in all_types: + all_types.append(type) + + type_map = {type: id for id, type in enumerate(all_types)} + rev_type_map = {id: type for type, id in type_map.items()} + type_size = len(type_map) + + typeTR_map = {} + for type, id in type_map.items(): + if type in typesTR: + typeTR_map[type] = id + rev_typeTR_map = {id: type for type, id in typeTR_map.items()} + # typeTR_size = len(typeTR_map) + + rev_tag_map, tag_map, _, _ = entity_tags(rev_type_map) + + tag_size = len(tag_map) + + trTypeIds = [id for id in rev_typeTR_map] + + tagsTR = data_struct['terms']['tagsTR'] + tagsTR2 = data_struct_dev['terms']['tagsTR'] + tagsTR.extend([tag for tag in tagsTR2 if tag not in tagsTR]) + rev_tag_mapTR = {tag_map[tag]: tag for tag in tagsTR} + + tag_mapTR = {tag: id for id, tag in rev_tag_mapTR.items()} + trTagsIds = [tag for tag in rev_tag_mapTR] + + tag2type = data_struct['terms']['tags2types'] + tag2type2 = data_struct_dev['terms']['tags2types'] + for tag in tag2type2: + if tag not in tag2type: + tag2type[tag] = tag2type2[tag] + tag2type_map = OrderedDict() + for tag in tag2type: + if tag != 'O': + type = tag2type[tag] + tag2type_map[tag_map[tag]] = type_map[type] + tag2type_map[0] = -1 # tag O + + tag2type = np.zeros(tag_size, np.int32) + for tag, type in tag2type_map.items(): + tag2type[tag] = type + + # 3. pos map + all_sents = data_struct['sentences']['sentences'] + all_sents.extend(data_struct_dev['sentences']['sentences']) + + length = [len([w for w in s.split()]) for s in all_sents] + ranges = [list(map(str, list(range(-l + 1, l)))) for l in length] + if params['include_nested']: + ranges.append(['inner']) # encode nestedness embeddings + ranges.append(['outer']) + pos_map, rev_pos_map, pos_size = _generate_mapping(ranges) + + # 4. 
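# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. The tag ids used
# here come from entity_tags (loader/prepData/entity.py), which interleaves
# I-/B- tags per type after O = 0; the toy types are assumptions.
_id_to_tag = {0: 'O'}
for _i, _t in enumerate(['Protein', 'Binding']):
    _id_to_tag[2 * _i + 1] = 'I-' + _t
    _id_to_tag[2 * _i + 2] = 'B-' + _t
assert _id_to_tag == {0: 'O', 1: 'I-Protein', 2: 'B-Protein',
                      3: 'I-Binding', 4: 'B-Binding'}
# ---------------------------------------------------------------------------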
rel map + rels = get_rtypes(data_struct, data_struct_dev) + rel_map, rev_rel_map, rel_size = _generate_mapping(rels) + + # Generate relation maps with L R distinguishing + rtype_map = {'Other': -1} + rel2rtype_map = {} + for rel in rel_map: + relid = rel_map[rel] + rtype = rel.split(':')[1] + if '1:' in rel and rtype != 'Other': # ony lef to right + rtype_map[rtype] = relid + + for rel in rel_map: + relid = rel_map[rel] + rtype = rel.split(':')[1] + rtypeid = rtype_map[rtype] + rel2rtype_map[relid] = rtypeid + + rel2rtype_map2 = np.zeros((len(rel2rtype_map)), dtype=np.int32) + for rel, rtype in rel2rtype_map.items(): + rel2rtype_map2[rel] = rtype + + rev_rtype_map = {id: type for type, id in rtype_map.items()} + # rev_rtype_map[rel_size] = 'None' # for the none relation in events + + # generate mappings for event structures + flat_structs_map, nested_structs_map, flat_types_id_map, nested_types_id_map, etype_pairs = process_structure( + data_struct, data_struct_dev, params, type_map, typeTR_map, rtype_map, type_size, rel_size) + + # modality + modality_map = {'non-modality': 1, 'Speculation': 2, 'Negation': 3} + rev_modality_map = {id: type for type, id in modality_map.items()} + ev_size = len(modality_map) + + # return + params['voc_sizes'] = {'word_size': word_size, + 'etype_size': type_size, + 'tag_size': tag_size, + 'pos_size': pos_size, + 'rel_size': rel_size, + 'ev_size': ev_size + } + params['mappings'] = {'word_map': word_map, 'rev_word_map': rev_word_map, + 'type_map': type_map, 'rev_type_map': rev_type_map, + 'typeTR_map': typeTR_map, 'rev_typeTR_map': rev_typeTR_map, + 'tag_map': tag_map, 'rev_tag_map': rev_tag_map, + 'tag_mapTR': tag_mapTR, 'rev_tag_mapTR': rev_tag_mapTR, + 'tag2type_map': tag2type, + 'pos_map': pos_map, 'rev_pos_map': rev_pos_map, + 'rel_map': rel_map, 'rev_rel_map': rev_rel_map, + 'rtype_map': rtype_map, 'rev_rtype_map': rev_rtype_map, + 'rel2rtype_map': rel2rtype_map2, + 'flat_structs_map': flat_structs_map, 'flat_types_id_map': flat_types_id_map, + 'nested_structs_map': nested_structs_map, 'nested_types_id_map': nested_types_id_map, + 'modality_map': modality_map, 'rev_modality_map': rev_modality_map, + 'etype_pairs': etype_pairs + } + params['trTags_Ids'] = trTagsIds + params['trTypes_Ids'] = trTypeIds + params['words_train'] = words_train + params['singletons'] = singlesW + params['max_sent_len'] = np.maximum(data_struct['sentences']['max_sent_len'], + data_struct_dev['sentences']['max_sent_len']) + params['rtype_trig_ev'] = rel_size + + return params + + +def find_ignore_label(params): + """ + :return: + id corresponds to the "Other" relation + dictionary with directionality, e.g. 
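# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. Relation labels
# carry their direction ('1:Rel:2' vs '2:Rel:1'); rtype_map above keeps only
# the left-to-right id per type, and rel2rtype_map collapses both directions
# onto it. The toy label ids are assumptions.
_rel_map = {'1:Other:2': 0, '1:Theme:2': 1, '2:Theme:1': 2}
_rtype_map = {'Other': -1}
for _rel, _rid in _rel_map.items():
    _rtype = _rel.split(':')[1]
    if _rel.startswith('1:') and _rtype != 'Other':
        _rtype_map[_rtype] = _rid
_rel2rtype = {_rid: _rtype_map[_rel.split(':')[1]]
              for _rel, _rid in _rel_map.items()}
assert _rel2rtype == {0: -1, 1: 1, 2: 1}
# ---------------------------------------------------------------------------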
relation_mapping['1:Rel:2'] = 3 + relation_mapping['2:Rel:1'] = 8 + lab_map[3] = 8, lab_map[8] = 3 + """ + lab2ign_id = params['mappings']['rel_map'][params['lab2ign']] + + # Map key of relation 1:REL:2 with 2:REL:1, else , map this key with itself, also map ignored keys + lab_map = OrderedDict() + for m, n in params['mappings']['rel_map'].items(): + for m2, n2 in params['mappings']['rel_map'].items(): + if m == m2: + continue + elif m == params['lab2ign'] or m2 == params['lab2ign']: + continue + elif m.split(':')[1] == m2.split(':')[1]: + lab_map[n] = n2 + + for m, n in params['mappings']['rel_map'].items(): + if n not in lab_map: + lab_map[n] = n + + lab_map[lab2ign_id] = lab2ign_id + params['lab_map'] = lab_map + params['lab2ign_id'] = lab2ign_id + return params + + +def _elem2idx(list_of_elems, map_func): + """ + :param list_of_elems: list of lists + :param map_func: mapping dictionary + :returns + list with indexed elements + """ + # fix bug for mlee + # return [[map_func[x] if x in map_func else map_func["O"] for x in list_of] for list_of in list_of_elems] + return [[map_func[x] for x in list_of] for list_of in list_of_elems] + + + diff --git a/loader/prepNN/prep4nn.py b/loader/prepNN/prep4nn.py index 75200c6..f488b6e 100644 --- a/loader/prepNN/prep4nn.py +++ b/loader/prepNN/prep4nn.py @@ -1,12 +1,15 @@ """Prepare data for training networks.""" +import collections from collections import OrderedDict from bert.tokenization import BertTokenizer from sklearn.preprocessing import MultiLabelBinarizer from loader.prepNN.sent2net import prep_sentences -from loader.prepNN.ent2net import entity2network, _elem2idx +from loader.prepNN.ent2net import entity2network +from loader.prepNN.ev2net import event2network +from loader.prepNN.mapping import _elem2idx from loader.prepNN.span4nn import get_nn_data @@ -21,12 +24,18 @@ def data2network(data_struct, data_type, params): all_sentences = [] + # C2T add: + max_ev_per_layer = params['max_ev_per_layer'] + # nner: Using subwords: tokenizer = BertTokenizer.from_pretrained( params['bert_model'], do_lower_case=False ) + events_map = collections.defaultdict() + for xx, sid in enumerate(data_struct['input']): + # input sentence_data = data_struct['input'][sid] @@ -34,13 +43,18 @@ def data2network(data_struct, data_type, params): fid = sid.split(':')[0] # words to ids + # words = sentence_data['words'] word_ids = wordsIDs[xx] words = org_sent_words[xx] # entity - readable_e, idxs, ents, toks2, etypes2ids, entities, sw_sentence, sub_to_word, subwords, valid_starts, tagsIDs, terms = entity2network( + readable_e, idxs, ents, toks2, etypes2ids, entities, sw_sentence, sub_to_word, subwords, valid_starts, tagsIDs, tagsTR, terms = entity2network( sentence_data, words, params, tokenizer) + # events + events, truth_ev, max_ev_per_layer = event2network(sentence_data, fid, idxs, events_map, max_ev_per_layer, + readable_e, params) + # return sentence_vector = OrderedDict() sentence_vector['fid'] = fid @@ -50,23 +64,43 @@ def data2network(data_struct, data_type, params): sentence_vector['offsets'] = sentence_data['offsets'] sentence_vector['e_ids'] = idxs sentence_vector['tags'] = tagsIDs + sentence_vector['tagsTR'] = tagsTR sentence_vector['etypes2'] = etypes2ids sentence_vector['toks2'] = toks2 sentence_vector['raw_words'] = sentence_data['words'] + sentence_vector['truth_ev'] = truth_ev + # nner sentence_vector['entities'] = entities sentence_vector['sw_sentence'] = sw_sentence sentence_vector['terms'] = terms + sentence_vector['relations'] = 
sentence_data['readable_r'] + sentence_vector['events'] = events sentence_vector['sub_to_word'] = sub_to_word sentence_vector['subwords'] = subwords sentence_vector['valid_starts'] = valid_starts - all_sentences.append(sentence_vector) + # ignore this sentence or not + ignore_sent = False + + # filter sentence with no entity, for training set only (contains 'train' in path) + if params['filter_no_ent_sents'] and data_type == 'train': - return all_sentences + # check number of entities in this sentence + ents_no = len(sentence_vector['e_ids']) + if ents_no == 0: + ignore_sent = True + if not ignore_sent: + all_sentences.append(sentence_vector) -def torch_data_2_network(cdata2network, params, do_get_nn_data): + # C2T add + params['max_ev_per_layer'] = max_ev_per_layer + + return all_sentences, events_map + + +def torch_data_2_network(cdata2network, events_map, params, do_get_nn_data): """ Convert object-type data to torch.tensor type data, aim to use with Pytorch """ etypes = [data['etypes2'] for data in cdata2network] @@ -76,6 +110,8 @@ def torch_data_2_network(cdata2network, params, do_get_nn_data): sw_sentences = [data['sw_sentence'] for data in cdata2network] termss = [data['terms'] for data in cdata2network] valid_startss = [data['valid_starts'] for data in cdata2network] + relationss = [data['relations'] for data in cdata2network] + eventss = [data['events'] for data in cdata2network] fids = [data['fid'] for data in cdata2network] wordss = [data['words'] for data in cdata2network] @@ -90,6 +126,7 @@ def torch_data_2_network(cdata2network, params, do_get_nn_data): # User-defined data if not params["predict"]: id_tag_mapping = params["mappings"]["nn_mapping"]["id_tag_mapping"] + trigger_ids = params["mappings"]["nn_mapping"]["trTypes_Ids"] mlb = MultiLabelBinarizer() mlb.fit([sorted(id_tag_mapping)[1:]]) # [1:] skip label O @@ -99,13 +136,23 @@ def torch_data_2_network(cdata2network, params, do_get_nn_data): params["max_span_width"] = max(params["max_entity_width"], params["max_trigger_width"]) + params["mappings"]["nn_mapping"]["full_labels"] = sorted([v for k, v in id_tag_mapping.items() if k > 0]) + params["mappings"]["nn_mapping"]["trigger_labels"] = sorted( + [v for k, v in id_tag_mapping.items() if k in trigger_ids]) + params["mappings"]["nn_mapping"]["num_triggers"] = len(params["mappings"]["nn_mapping"]["trigger_labels"]) params["mappings"]["nn_mapping"]["num_entities"] = params["mappings"]["nn_mapping"]["num_labels"] - \ params["mappings"]["nn_mapping"]["num_triggers"] if do_get_nn_data: - nn_data = get_nn_data(fids, entitiess, termss, valid_startss, sw_sentences, - tokenizer, params) + nn_data = get_nn_data(fids, entitiess, termss, valid_startss, relationss, eventss, sw_sentences, + tokenizer, events_map, + params) return {'nn_data': nn_data, 'etypes': etypes, 'fids': fids, 'words': wordss, 'offsets': offsetss, 'sub_to_words': sub_to_words, 'subwords': subwords, 'entities': entitiess} + else: + return {'termss': termss, 'relationss': relationss, 'eventss': eventss, 'sw_sentences': sw_sentences, + 'tokenizer': tokenizer, 'events_map': events_map, 'params': params, 'etypes': etypes, 'fids': fids, + 'words': wordss, 'offsets': offsetss, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'entities': entitiess} diff --git a/loader/prepNN/rel2net.py b/loader/prepNN/rel2net.py new file mode 100644 index 0000000..0039357 --- /dev/null +++ b/loader/prepNN/rel2net.py @@ -0,0 +1,23 @@ +"""Prepare relation data for networks.""" + + +def gen_nn_rel_info(span_terms, relations, params): + 
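# ---------------------------------------------------------------------------
# NOTE (editor): illustrative sketch, not part of the patch. Because nested
# mentions can give one span several gold tags at once, span labels are
# multi-hot vectors, which is what the MultiLabelBinarizer fit above
# produces; the toy tag ids are assumptions.
from sklearn.preprocessing import MultiLabelBinarizer
_mlb = MultiLabelBinarizer()
_mlb.fit([[1, 2, 3, 4]])                   # every tag id except O (id 0)
print(_mlb.transform([[1, 3], []]))        # -> [[1 0 1 0]
                                           #     [0 0 0 0]]
# ---------------------------------------------------------------------------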
gtruth = {} + left = [] + right = [] + + for term_i, term_j in relations: + if term_i in span_terms.term2id and term_j in span_terms.term2id: + i = span_terms.term2id[term_i] + j = span_terms.term2id[term_j] + + rel_id = relations[(term_i, term_j)][0] + map_rel_type = params['mappings']['rel_map'][relations[(term_i, term_j)][1]] + params['statistics']['rel'][map_rel_type] = params['statistics']['rel'][map_rel_type] + 1 + gtruth[i, j] = map_rel_type + if ('_INV' not in rel_id) and (term_i != term_j): + # if it is inverse, take the index of the element + left.append(i) + right.append(j) + + return gtruth, (left, right) diff --git a/loader/prepNN/sent2net.py b/loader/prepNN/sent2net.py index 568736f..a268880 100644 --- a/loader/prepNN/sent2net.py +++ b/loader/prepNN/sent2net.py @@ -15,6 +15,7 @@ def prep_sentences(data_struct, data_type, params): # MAPPINGS singlesW = params['singletons'] words_train = params['words_train'] + # pre_words = params['pre_words'] uw_prob = params['unk_w_prob'] if data_type == 'train': singlesW = set(singlesW) @@ -40,6 +41,7 @@ def prep_sentences(data_struct, data_type, params): in_train = 0 in_pretrain = 0 nowhere = 0 + # pre_words = set(pre_words) words_train = set(words_train) for sid, s in enumerate(data_struct['sentences']): ff = [] @@ -47,7 +49,9 @@ def prep_sentences(data_struct, data_type, params): if w in words_train: in_train += 1 ff.append(w) - + # elif w in pre_words: + # in_pretrain += 1 + # ff.append(w) else: nowhere += 1 ff.append('') diff --git a/loader/prepNN/span4nn.py b/loader/prepNN/span4nn.py index e1e6e0e..ba2f0fb 100644 --- a/loader/prepNN/span4nn.py +++ b/loader/prepNN/span4nn.py @@ -3,8 +3,12 @@ import numpy as np from collections import namedtuple +from loader.prepNN.rel2net import gen_nn_rel_info +from loader.prepNN.ev2net import count_ev_truth, gen_nn_truth_evs + Term = namedtuple('Term', ['id2term', 'term2id', 'id2label']) + def get_span_index( span_start, span_end, @@ -30,8 +34,10 @@ def get_span_index( return span_index * limit + index -def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, params): +def get_batch_data(fid, entities, terms, valid_starts, relations, events, sw_sentence, tokenizer, events_map, + params): mlb = params["mappings"]["nn_mapping"]["mlb"] + num_labels = params["mappings"]["nn_mapping"]["num_labels"] max_entity_width = params["max_entity_width"] max_trigger_width = params["max_trigger_width"] @@ -49,7 +55,14 @@ def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, p tokens = tokens[:num_tokens] token_mask = token_mask[:num_tokens] - ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"]) + # use lstm + if params['use_lstm']: + tokens = [""] + tokens + [""] + ids = [0] * len(tokens) + + # or bert + else: + ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"]) token_mask = [0] + token_mask + [0] @@ -94,6 +107,8 @@ def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, p if span_start not in valid_starts or (span_end + 1) not in valid_starts: # Ensure that there is no entity label here if not (params['predict'] and (params['pipelines'] and params['pipe_flag'] != 0)): + + # TODO: temporarily comment to fix bug, check again assert (span_start, span_end) not in entities entity_mask = 0 @@ -104,7 +119,13 @@ def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, p if (span_start, span_end) in entities: span_label = entities[(span_start, span_end)] span_term = terms[(span_start, span_end)] + # 
check if term can create relation in gold + # for idx, term in enumerate(span_term): + # if term not in params['map_entities_without_relations']: + # span_label_match_rel = 1 + # break + # assert len(span_label) <= params["ner_label_limit"], "Found an entity having a lot of types" if len(span_label) > params["ner_label_limit"]: print('over limit span_label', span_term) @@ -113,6 +134,7 @@ def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, p sorted(zip(span_label, span_term), reverse=True)[:params["ner_label_limit"]]): span_index = get_span_index(span_start, span_end, max_span_width, num_tokens, idx, params["ner_label_limit"]) + span_terms.id2term[span_index] = term_id span_terms.term2id[term_id] = span_index @@ -128,6 +150,12 @@ def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, p entity_masks.append(entity_mask) trigger_masks.append(trigger_mask) + # relations + gtruth, l2r = gen_nn_rel_info(span_terms, relations, params) + + # events + truth_ev, ev_idxs, ev_lbls = gen_nn_truth_evs(fid, span_terms, events, events_map, params) + return { 'tokens': tokens, 'ids': ids, @@ -138,24 +166,47 @@ def get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, p 'span_labels_match_rel': span_labels_match_rel, 'entity_masks': entity_masks, 'trigger_masks': trigger_masks, - 'span_terms': span_terms + 'span_terms': span_terms, + 'gtruth': gtruth, + 'l2r': l2r, + 'truth_ev': truth_ev, + 'ev_idxs': ev_idxs, + 'ev_lbls': ev_lbls } -def get_nn_data(fids, entitiess, termss, valid_startss, sw_sentences, tokenizer, params): +def get_nn_data(fids, entitiess, termss, valid_startss, relationss, eventss, sw_sentences, tokenizer, events_map, + params): samples = [] + max_ev_per_batch = params['max_ev_per_batch'] + for idx, sw_sentence in enumerate(sw_sentences): fid = fids[idx] entities = entitiess[idx] terms = termss[idx] valid_starts = valid_startss[idx] - - sample = get_batch_data(fid, entities, terms, valid_starts, sw_sentence, tokenizer, - params) + relations = relationss[idx] + events = eventss[idx] + sample = get_batch_data(fid, entities, terms, valid_starts, relations, events, sw_sentence, tokenizer, + events_map, params) + max_ev_per_batch = max(sample['truth_ev'].shape[0], max_ev_per_batch) samples.append(sample) - all_tokens = [] + # count the number of events in truth + count_ev_truth(samples) + + print('max_ev_per_batch', max_ev_per_batch) + print('max_ev_per_layer', params['max_ev_per_layer']) + print('max_seq', params['max_seq']) + + params['max_ev_per_batch'] = max_ev_per_batch + + # for lstm + if params['use_lstm']: + all_tokens = [sample["tokens"] for sample in samples] + else: + all_tokens = [] all_ids = [sample["ids"] for sample in samples] all_token_masks = [sample["token_mask"] for sample in samples] @@ -166,6 +217,11 @@ def get_nn_data(fids, entitiess, termss, valid_startss, sw_sentences, tokenizer, all_entity_masks = [sample["entity_masks"] for sample in samples] all_trigger_masks = [sample["trigger_masks"] for sample in samples] all_span_terms = [sample["span_terms"] for sample in samples] + all_gtruth = [sample["gtruth"] for sample in samples] + all_l2r = [sample["l2r"] for sample in samples] + all_truth_ev = [sample["truth_ev"] for sample in samples] + all_ev_idxs = [sample["ev_idxs"] for sample in samples] + all_ev_lbls = [sample["ev_lbls"] for sample in samples] return { 'tokens': all_tokens, @@ -177,5 +233,10 @@ def get_nn_data(fids, entitiess, termss, valid_startss, sw_sentences, tokenizer, 
'span_labels_match_rel': all_span_labels_match_rel, 'entity_masks': all_entity_masks, 'trigger_masks': all_trigger_masks, - 'span_terms': all_span_terms + 'span_terms': all_span_terms, + 'gtruth': all_gtruth, + 'l2r': all_l2r, + 'truth_ev': all_truth_ev, + 'ev_idxs': all_ev_idxs, + 'ev_lbls': all_ev_lbls } diff --git a/loader/prepNN/structure.py b/loader/prepNN/structure.py new file mode 100644 index 0000000..53cac7a --- /dev/null +++ b/loader/prepNN/structure.py @@ -0,0 +1,283 @@ +"""Process event structures.""" + +from glob import glob +import os +import json +from loguru import logger +import collections +from collections import OrderedDict +import numpy as np + +from utils import utils + + +def load_general_rules(cur_rules, params): + + num_dups = 0 + + rule_fns = glob(os.path.join(params["rule_dir"], "*.rule")) + + for rule_fn in rule_fns: + for rule_line in utils.read_lines(rule_fn): + trigger_id, args = json.loads(rule_line) + + accumulative_level = 0 + + rel_arg_pairs = [] + rel_arg_pair_strs = [] + + if args: + for relation_id, level, arg_id in args: + accumulative_level += level + + rel_arg_pairs.append([relation_id, arg_id]) + rel_arg_pair_strs.append("{}{}{}".format(relation_id, level, arg_id)) + else: + rel_arg_pairs.append([str(None), trigger_id]) + rel_arg_pair_strs.append("{}{}{}".format(str(None), 0, trigger_id)) + + rule_str = "+".join(rel_arg_pair_strs) + + if accumulative_level > 0: + if trigger_id in cur_rules["structs1"]: + if rule_str in cur_rules["structs1"][trigger_id]: + num_dups += 1 + else: + cur_rules["structs1"][trigger_id][rule_str] = [rel_arg_pairs] + else: + if trigger_id in cur_rules["structs0"]: + if rule_str in cur_rules["structs0"][trigger_id]: + num_dups += 1 + else: + cur_rules["structs0"][trigger_id][rule_str] = [rel_arg_pairs] + + logger.debug("# Event rule duplicates: {}".format(num_dups)) + + +def remove_invalid_rules(cur_rules): + for rule_structures in cur_rules.values(): + for trigger_id in rule_structures: + for rule_str in list(rule_structures[trigger_id]): + has_relations = {relation_id for relation_id, _ in rule_structures[trigger_id][rule_str][0]} + if trigger_id == "Mutation" and "Theme" not in has_relations and ( + "CSite" in has_relations or "Site" in has_relations): + del rule_structures[trigger_id][rule_str] + logger.info("Removed an invalid rule: {} {}".format(trigger_id, rule_str)) + + +def merge_struct(train_struct, dev_struct): + for trigger_id, rule_structure in dev_struct.items(): + for rule_str, args in rule_structure.items(): + + if trigger_id not in train_struct: + # this trigger not in train set, create new + train_struct[trigger_id] = OrderedDict() + + train_struct[trigger_id][rule_str] = args + + +def count_rules(train_struct): + count = 0 + for type_tr, pairs in train_struct.items(): + count += len(pairs) + return count + + +def prep_structs_mapping(structsTR, type_map, rtype_map, rel_size): + structs_types = OrderedDict() + structs_map = OrderedDict() + + max_ev_per_tr = 0 + max_rel_per_ev = 0 + + for typeTR, structs in structsTR.items(): + typeTRid = type_map[typeTR] + structs_id = [] + structs_map[typeTR] = OrderedDict() + + max_ev_per_tr = max(max_ev_per_tr, len(structs)) + + for struct, struct_data in structs.items(): + + rel_id = [] + for rel in struct_data[0]: + if rel[0] == 'None': + r2id = (rel_size, typeTRid) + rel_id.append(r2id) + else: + if rel[0] in rtype_map and rel[1] in type_map: + r2id = (rtype_map[rel[0]], type_map[rel[1]]) + rel_id.append(r2id) + + max_rel_per_ev = max(max_rel_per_ev, len(rel_id)) + + 
# structs_map[typeTR][struct] = rel_id + if len(rel_id) == len(struct_data[0]): + rel_id_count = collections.Counter(rel_id) + structs_map[typeTR][struct] = rel_id_count + + # check to avoid duplicate + if rel_id not in structs_id: + structs_id.append(rel_id) + structs_types[typeTR] = structs_id + + return structs_map, structs_types, max_ev_per_tr, max_rel_per_ev + + +def prep_struct_map_ids(struct_map, typeTR_map, type_size, rel_size): + """ + :param struct_map: mapping for each trigger type, there is a list of event structure, each structure is a list of arguments, each argument is a pair of relation type index, entity type index + :param typeTR_map: mapping, each trigger type is assigned with an integer + :param type_size: number of entity types + trigger types + :param typeTR_size: number of trigger types + :param rel_size: number of relation types + :return: + ev_structs_ids: array[type_size x 5], for the number of argument, each element is a list object for event structures of each trigger type + ev_structs_args: array[type_size], list of arguments (pairs of (relation type, entity type)) for each trigger type + """ + # convert event structure map into indices, size=[trigger_type_size x 5_arguments] (0 is for no argument) + ev_structs_ids = -1 * np.ones((type_size + 1, 5), dtype=np.object) + # struct_arg_map = OrderedDict() + + for typeTR, structs in struct_map.items(): + trid = typeTR_map[typeTR] + + # devide and store arguments separately by the number of arguments + structs_0arg = [] # no argument + structs_1arg = [] # 1 argument + structs_2arg = [] # 2 arguments + structs_3arg = [] # 3 arguments + structs_4arg = [] # 4 arguments + # struct_arg_map[typeTR] = OrderedDict() + + for struct in structs: + args = [] + no_arg = len(struct) + for arg in struct: + args.append(arg) + + # check if it is no arg + zero_arg = False + if no_arg == 1 and args[0][0] == rel_size: + zero_arg = True + + # convert list of arg to counter: compare easier + args = collections.Counter(args) + + # store args to map + # struct_arg_map[typeTR][struct] = args + + # check if there is one argument: + if no_arg == 1: + + # it can be no argument (argument with the relation type is OTHER) + if zero_arg: + if args not in structs_0arg: + structs_0arg.append(args) + + # or it can be one arguments + else: + if args not in structs_1arg: + structs_1arg.append(args) + + # otherwise: 2, 3, 4 arguments + elif no_arg == 2: + if args not in structs_2arg: + structs_2arg.append(args) + elif no_arg == 3: + if args not in structs_3arg: + structs_3arg.append(args) + elif no_arg == 4: + if args not in structs_4arg: + structs_4arg.append(args) + + # store event structures, for each trigger type id, and for each number of argument: 0..4 + if len(structs_0arg) > 0: + ev_structs_ids[trid][0] = structs_0arg + if len(structs_1arg) > 0: + ev_structs_ids[trid][1] = structs_1arg + if len(structs_2arg) > 0: + ev_structs_ids[trid][2] = structs_2arg + if len(structs_3arg) > 0: + ev_structs_ids[trid][3] = structs_3arg + if len(structs_4arg) > 0: + ev_structs_ids[trid][4] = structs_4arg + + return ev_structs_ids + + +def prep_pair_mapping(structsTR, type_map): + etype_pairs = collections.defaultdict(set) + for _, struct_level in structsTR.items(): + for typeTR, argStructs in struct_level.items(): + for _, argStruct in argStructs.items(): + for argPair in argStruct[0]: + typeT = argPair[1] + # pair_map[typeTR].add(typeT) + if typeT in type_map and typeTR in type_map: + etype_pairs[type_map[typeTR]].add(type_map[typeT]) + + return etype_pairs + + 
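+# A minimal usage sketch of prep_pair_mapping above (hypothetical rule string and
+# type ids, not taken from any real corpus mapping): each trigger type id ends up
+# mapped to the set of type ids that may fill one of its argument slots.
+#
+#   structsTR = {'structs0': {'Binding': {'rule': [[['Theme', 'Protein']]]}}}
+#   type_map = {'Binding': 3, 'Protein': 7}
+#   prep_pair_mapping(structsTR, type_map)
+#   # -> defaultdict(set, {3: {7}})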
+def prep_pair_mapping_from_file(entity_pairs, type_map): + with open(entity_pairs, 'r') as stream: + entity_pairs = utils._ordered_load(stream) + + etype_pairs = collections.defaultdict(set) + for e, p in entity_pairs.items(): + try: + etype = type_map[e] + ps = p.split(',') + for paired_e in ps: + try: + paired_etype = type_map[paired_e] + etype_pairs[etype].add(paired_etype) + except: + pass + except: + pass + + return etype_pairs + + +def process_structure(data_struct, data_struct_dev, params, type_map, typeTR_map, rtype_map, type_size, rel_size): + structs_tr = data_struct['structsTR'] + structs_tr_dev = data_struct_dev['structsTR'] + + if params['use_dev_rule']: + merge_struct(structs_tr['structs0'], structs_tr_dev['structs0']) + merge_struct(structs_tr['structs1'], structs_tr_dev['structs1']) + + if params['use_general_rule']: + load_general_rules(structs_tr, params) + remove_invalid_rules(structs_tr) + + print('Total FLAT rules', count_rules(structs_tr['structs0'])) + print('Total NESTED rules', count_rules(structs_tr['structs1'])) + + # convert structure mapping into indices + flat_structs_map, flat_types_map, max_ev_per_tr0, max_rel_per_ev0 = prep_structs_mapping(structs_tr['structs0'], + type_map, rtype_map, + rel_size) + nested_structs_map, nested_types_map, max_ev_per_tr1, max_rel_per_ev1 = prep_structs_mapping(structs_tr['structs1'], + type_map, rtype_map, + rel_size) + + # create event structures for flat and nested events + flat_types_id_map = prep_struct_map_ids(flat_types_map, typeTR_map, type_size, rel_size) + nested_types_id_map = prep_struct_map_ids(nested_types_map, typeTR_map, type_size, rel_size) + + params['max_ev_per_tr'] = max(max_ev_per_tr0, max_ev_per_tr1, params['max_ev_per_tr']) + params['max_rel_per_ev'] = max(max_rel_per_ev0, max_rel_per_ev1, params['max_rel_per_ev']) + params['max_rel_per_ev'] += 1 + + print('max_ev_per_tr', params['max_ev_per_tr']) + print('max_rel_per_ev', params['max_rel_per_ev']) + + etype_pairs = prep_pair_mapping(structs_tr, type_map) + + # if params['using_entity_pairs_filter']: + # etype_pairs = prep_pair_mapping_from_file(params['entity_pairs'], type_map) + + return flat_structs_map, nested_structs_map, flat_types_id_map, nested_types_id_map, etype_pairs diff --git a/nets/EVGen.py b/model/EVGen.py similarity index 85% rename from nets/EVGen.py rename to model/EVGen.py index bdc413f..3e84ad8 100644 --- a/nets/EVGen.py +++ b/model/EVGen.py @@ -15,6 +15,96 @@ def __init__(self, params): # parameters self.params = params + def show_input(self, etypes, l2r, rpred_types, rpred_ids, ev_idx, ev_truth, ev_lbls): + """For debug, convert indices to real events.""" + + # print relation data + print('No., batch id, a1id, a2id, r-type') + for xx, rid in enumerate(rpred_ids): + # indices + bid = l2r[0][rid] + a1id = l2r[1][rid] + a2id = l2r[2][rid] + + # rtype + rtypeid = rpred_types[rid] + rtype = self.params['mappings']['rev_rtype_map'][rtypeid] + + # entities + a1 = self.params['debugs']['terms_map'][bid, a1id] + a2 = self.params['debugs']['terms_map'][bid, a2id] + + # show + print(xx, bid, a1id, a2id, rtype, a1, a2) + + return + + def show_rels_group(self, rels_group): + """For debug.""" + xx = 0 + for trid, rel_group in rels_group.items(): + + # store a list of (rtype-etype) + rels = [] + for rel_data in rel_group: + a1typeid = rel_data[0] + a2ids = rel_data[3] + rtypeid = rel_data[2][0] + a2typeid = rel_data[2][1] + rels.append(rel_data[2]) + + bid = trid[0] + a1id = trid[1] + a2id = a2ids[1] + a1type = 
self.params['mappings']['rev_type_map'][a1typeid] + a2type = self.params['mappings']['rev_type_map'][a2typeid] + a1 = self.params['debugs']['terms_map'][bid, a1id] + a2 = self.params['debugs']['terms_map'][bid, a2id] + rtype = self.params['mappings']['rev_rtype_map'][rtypeid] + xx += 1 + + print(xx, trid, a1, a1type, a2, a2type, rtype) + print(trid, rels) + print() + + def deb_generated_candidates(self, ev_st_candidates, etypes, ev_cand_triggers): + """For debug only, check the output of generated event structure candidates.""" + + # translate each candidate + # format: [0=trig_id, 1-ev-structure-counter, 2-ev-structure-order, 3-ev_label, 4=modality label, 5=[list IN/OUT ids] ] + for xx, ev_cand in enumerate(ev_st_candidates): + trid = ev_cand[0] + rel_group = ev_cand[1] + rel_group_list = ev_cand[2] + ev_label = ev_cand[3] + # mod_label = ev_cand[4] + # io_ids = ev_cand[5] + + bid = trid[0] + a1id = trid[1] + a1 = self.params['debugs']['terms_map'][bid, a1id] + a1typeid = etypes[(bid, a1id)].item() + a1type = self.params['mappings']['rev_type_map'][a1typeid] + + # trigger and trigger type + print(xx, a1, a1type, 'ev label=', ev_label) + + # arguments structure + rel_group_vals = list(rel_group.elements()) + for arg_ in rel_group_vals: + rtypeid = arg_[0] + a2typeid = arg_[1] + a2type = self.params['mappings']['rev_type_map'][a2typeid] + if rtypeid == self.params['voc_sizes']['rel_size']: + rtype = 'None' + else: + rtype = self.params['mappings']['rev_rtype_map'][rtypeid] + print('(', rtype, a2type, ')') + + print() + + return + def group_rels(self, l2r, rpred_types, rpred_ids, etypes): """For generating event candidates.""" @@ -48,13 +138,21 @@ def group_rels(self, l2r, rpred_types, rpred_ids, etypes): [a2typeid, rid.item(), (rtypeid, a1typeid), (bid.item(), a1id.item())]) # if both a1 and a2 are trigger: this can be for nested events + # the direction can be reverse later to make sure having enough candidates: TODO if a1typeid in self.params['trTypes_Ids'] and a2typeid in self.params['trTypes_Ids']: nest_rels_group[(bid.item(), a1id.item())].append( [a1typeid, rid.item(), (rtypeid, a2typeid), (bid.item(), a2id.item())]) + # show output for debug only + # TODO: comment when training + # print('print flat rels_group') + # self.show_rels_group(flat_rels_group) + # print('print nested rels_group') + # self.show_rels_group(nest_rels_group) + return flat_rels_group, nest_rels_group - def add_no_arg_trigger(self, tr_ids, etypes, flat_structs_map): + def add_no_arg_trigger(self, tr_ids, etypes, ev_idx, ev_truth, ev_lbls, flat_structs_map): """Add no-argument triggers.""" # store in a map: key is trigger id, value is a pair of (rtype, trigger type); rtype is a special type @@ -64,8 +162,22 @@ def add_no_arg_trigger(self, tr_ids, etypes, flat_structs_map): for trid_ in tr_ids: trid = (trid_[0].item(), trid_[1].item()) - truth = [-1] - mod_label = [-1] + # process truth and labels + bid = trid[0] + a1id = trid[1] + truth_idx = ev_idx[bid].get(a1id, -1) + if truth_idx != -1: + truth = ev_truth[bid][truth_idx] + mod_label = ev_lbls[bid][truth_idx] # modality: 1-non-modality, 2-speculation, 3-negation + + # truth for flat and nested + # flat_truth = truth[0] + # nest_truth = truth[1] + truth = truth[0] # for flat. No-argument is always flat. 
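+                # no-argument candidates only need the flat-level truth here;
+                # the nested levels of ev_truth are consumed in add_truth_to_trigger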
+ + else: + truth = [-1] + mod_label = [-1] # rtype and trigger type rtype = self.params['voc_sizes']['rel_size'] @@ -80,16 +192,33 @@ def add_no_arg_trigger(self, tr_ids, etypes, flat_structs_map): return no_arg_group - def add_truth_to_trigger(self, rels_group, structs_map, levelid=0): + def add_truth_to_trigger(self, rels_group, ev_idx, ev_truth, ev_lbls, structs_map, levelid=0): """For generating event candidates. # add event truth and labels to each trigger # levelid = 0: flat, levelid=1: nested events """ + # TODO: nested for trid, rel_group in rels_group.items(): + # get index + bid = trid[0] + a1id = trid[1] - truth = -1 * np.ones((self.params['max_ev_level'] + 1, self.params['max_ev_args'] + 1), dtype=np.object) - mod_label = [-1] + # process truth and labels + truth_idx = ev_idx[bid].get(a1id, -1) + if truth_idx != -1: + truth = ev_truth[bid][truth_idx] + mod_label = ev_lbls[bid][truth_idx] # modality: 1-non-modality, 2-speculation, 3-negation + + # truth for flat and nested + # flat_truth = truth[0] + # nest_truth = truth[1] # TODO + # truth = truth[levelid:] # for flat + + else: + # level_truth = [-1] + truth = -1 * np.ones((self.params['max_ev_level'] + 1, self.params['max_ev_args'] + 1), dtype=np.object) + mod_label = [-1] if levelid == 0: level_truth = truth[levelid] @@ -175,6 +304,7 @@ def create_multiple_flat_arg_candidates(self, trid, rels_group, args_list, n_arg # generate all possible combinations among arguments with limited by the maximum number of args max_n_args = self.params['max_ev_args'] + # TODO: now fix as 4 arguments, can revise to set in parameter later. for xx1, arg1_ in enumerate(args_list): @@ -331,6 +461,7 @@ def create_multiple_nest_arg_candidates(self, trid, rels_group, args_list, n_arg # generate all possible combinations among arguments with limited by the maximum number of args max_n_args = self.params['max_ev_args'] + # TODO: now fix as 4 arguments, can revise to set in parameter later. for xx1, arg1_ in enumerate(args_list): @@ -779,6 +910,9 @@ def add_nest_arguments(self, nest_rels_group, ev_flat_arg_ids4nn, flat_rels_grou Add reverse trigger pair if necessary. """ + # store the list of trigger and entity arguments + # nest_args = collections.OrderedDict() + # store new reversed arguments in a new dictionary rev_nest_rels_group = collections.defaultdict(list) @@ -804,7 +938,9 @@ def add_nest_arguments(self, nest_rels_group, ev_flat_arg_ids4nn, flat_rels_grou args_list.append(rel_group) # if this argument not in flat candidates, this is not a candidate, we can reverse + # TODO: or we always reverse? 
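+            # ("reverse" here means swapping the trigger and argument roles of the
+            # pair, so the nested candidate can anchor on the other trigger instead)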
else: + # if argid not in ev_flat_arg_ids4nn: # create the reverse data rev_arg = [] @@ -818,13 +954,32 @@ def add_nest_arguments(self, nest_rels_group, ev_flat_arg_ids4nn, flat_rels_grou rev_args_list.append((argid, rev_arg)) + # add as a new candidate + # if argid in nest_rels_group: + # arg_list = [a2id[3] for a2id in nest_rels_group[argid]] + + # check if this argument already exists + # if trid not in arg_list: + # nest_rels_group[argid].append(rev_arg) + # print('REVERSE TRIGGER PAIRS FOR NESTED EVENT.') + # else: + # add the reverse pair + # if argid in rev_nest_rels_group: + # rev_nest_rels_group[argid].append(rev_arg) + # else: + # rev_nest_rels_group[argid] = [rev_arg] + # print('REVERSE TRIGGER PAIRS FOR NESTED EVENT.') # the reverse also not in flat events else: no_ev_list.append([trid, rel_group]) no_ev_list.append([argid, rev_arg]) + # both directions have no flat event + # else: + # print('INVALID NESTED EVENT CANDIDATE: NO FLAT EVENT TO THE TRIGGER ARGUMENT.') + # add to the map if len(args_list) > 0: if trid in rev_nest_rels_group: @@ -880,10 +1035,13 @@ def add_nest_arguments(self, nest_rels_group, ev_flat_arg_ids4nn, flat_rels_grou ent_arg_data.append([[[a2id, (-1, -1)]]]) rel_groups.extend(ent_args_list) + # store argument list for trigger + # nest_args[trid] = [trig_args, ent_args_list] + return rev_nest_rels_group - def generate_event_candidate_structures(self, etypes, tr_ids, l2r, rpred_types, rpred_ids - ): + def generate_event_candidate_structures(self, etypes, tr_ids, l2r, rpred_types, rpred_ids, ev_idx, ev_truth, + ev_lbls): """ Generate event candidates structures. - Given a list of predicted/gold entities, triggers, relations - Given a set of EVENT STRUCTURES (rules by annotation), separated by event type (also trigger type) @@ -909,16 +1067,19 @@ def generate_event_candidate_structures(self, etypes, tr_ids, l2r, rpred_types, flat_structs_map = self.params['mappings']['flat_types_id_map'] nest_structs_map = self.params['mappings']['nested_types_id_map'] + # show input: for debug only; to convert data indices to real entities, events .., it is difficult to check by indices + # self.show_input(etypes, l2r, rpred_types, rpred_ids, ev_idx, ev_truth, ev_lbls) + # group rels for each trigger: one for flat and one for nested events flat_rels_group, nest_rels_group = self.group_rels(l2r, rpred_types, rpred_ids, etypes) # add truth, labels, and event structure to each trigger # the mapping: key=trigger id, values = a list of[ [list of relations], [truth, label, ev-structures] ] - ev_flat_cand_triggers = self.add_truth_to_trigger(flat_rels_group, flat_structs_map, + ev_flat_cand_triggers = self.add_truth_to_trigger(flat_rels_group, ev_idx, ev_truth, ev_lbls, flat_structs_map, levelid=0) # prepare for no argument candidates - ev_no_arg_cand_triggers = self.add_no_arg_trigger(tr_ids, etypes, flat_structs_map) + ev_no_arg_cand_triggers = self.add_no_arg_trigger(tr_ids, etypes, ev_idx, ev_truth, ev_lbls, flat_structs_map) # create flat event candidates using event structures ev_flat_st_candidates, ev_flat_arg_ids4nn = self.create_ev_candidates(ev_flat_cand_triggers, @@ -929,23 +1090,26 @@ def generate_event_candidate_structures(self, etypes, tr_ids, l2r, rpred_types, rev_nest_rels_group = self.add_nest_arguments(nest_rels_group, ev_flat_arg_ids4nn, flat_rels_group) # add truth: do it later after flat prediction - ev_nest_cand_triggers = self.add_truth_to_trigger(rev_nest_rels_group, + ev_nest_cand_triggers = self.add_truth_to_trigger(rev_nest_rels_group, ev_idx, ev_truth, 
ev_lbls, nest_structs_map, levelid=1) + # for debug only, show generated candidates, TODO: comment when training + # _ = self.deb_generated_candidates(ev_st_candidates, etypes, ev_cand_triggers) + # prepare for creating embeddings from event structure candidates ev_flat_cands_ids4nn = self.prepare4nn(ev_flat_st_candidates) return {'ev_cand_ids4nn': ev_flat_cands_ids4nn, 'ev_arg_ids4nn': ev_flat_arg_ids4nn, 'ev_nest_cand_triggers': ev_nest_cand_triggers} - def _generate(self, etypes, tr_ids, l2r, rpred_types, rpred_ids): + def _generate(self, etypes, tr_ids, l2r, rpred_types, rpred_ids, ev_idx, ev_truth, ev_lbls): """Generate event candidates indices for creating embeddings.""" # a map with two output: # 1-event candidate indices: a list of event candidate, [trigger id, event label, modality label, in/out ids] # 2-event argument indices for each trigger: a map (key: trigger id, values: ids of relations and entity arguments) - ev_ids4nn = self.generate_event_candidate_structures(etypes, tr_ids, l2r, rpred_types, rpred_ids - ) + ev_ids4nn = self.generate_event_candidate_structures(etypes, tr_ids, l2r, rpred_types, rpred_ids, ev_idx, + ev_truth, ev_lbls) return ev_ids4nn @@ -965,6 +1129,9 @@ def select_nest_arguments(self, nest_group_rels, flat_pos_tr_ids, flat_pos_truth # get arguments args_list = args_data[:-1] + # truth for this trigger + # rels_group = nest_group_rels[trid] + # check whether the trigger argument ids included in the predicted positive tr_ids for trig_arg_data in args_list: @@ -974,8 +1141,13 @@ def select_nest_arguments(self, nest_group_rels, flat_pos_tr_ids, flat_pos_truth # store which event id will replace trigger argument, and its truth posid_list = [] + # truth_list = [] + + # store positive ids by level + # pos_level_list = [] # check all possible appearance of this trigger in the predicted events + # TODO: Replace for loop by a better function to find all matched indices in a list for posid, pos_trid in enumerate(flat_pos_tr_ids): if argid == pos_trid: pos_truth = flat_pos_truth_ids[posid] @@ -983,16 +1155,31 @@ def select_nest_arguments(self, nest_group_rels, flat_pos_tr_ids, flat_pos_truth # only add positive truth for training if pos_truth != -1 or not self.training: # positive id: index of (level, event id) + # posid_ = (current_nested_level, posid) posid_list.append([pos_truth, (current_nested_level, posid)]) + # truth_list.append(flat_pos_truth_ids[posid]) + # add to the list of arguments by level: 4th element trig_arg_data[4].append(posid_list) # if there is predicted events if len(posid_list) > 0: + # add by level + # pos_level_list.append([truth_list, posid_list]) + + # add to the list of arguments: for only one nested level + # trig_arg_data.append(truth_list) + # trig_arg_data.append(posid_list) + + # add to the list of arguments by level: 4th element + # trig_arg_data[4].append(posid_list) + # mark this is used to make the next level nested candidate trig_arg_data[5] = 1 + # is_new_ev = True + # pos_trig_args_list.append(trig_arg_data) # otherwise: mark this event argument is not used to search for next level nested candidates else: @@ -1036,6 +1223,10 @@ def generate_candidate_output(self, trid, rel_group_counter, rel_group_list, arg matched_truth = (0, truth_[0][0], truth_[0][1], truth_[0][2]) break + # truth for negative label + # if matched_truth == -1: + # matched_truth = (0, rel_group_counter, cand_eids_count, []) + # store the output cand_output.append(trid) cand_output.append(rel_group_counter) @@ -1055,6 +1246,10 @@ def 
generate_nest_candidate_output(self, trid, rel_group_counter, rel_group_list # format: [0=trig_id, 1-ev-structure-counter, 2-ev-structure-order, 3-ev_label, 4=modality label, 5=[list IN/OUT ids] ] cand_output = [] + # TODO: comment this + # convert ids to Counter to compare + # cand_eids_count = collections.Counter(a2_ids) + ev_label = 0 mod_label = 1 diff --git a/nets/EVNet.py b/model/EVNet.py similarity index 80% rename from nets/EVNet.py rename to model/EVNet.py index 270e657..440b3b2 100644 --- a/nets/EVNet.py +++ b/model/EVNet.py @@ -5,24 +5,19 @@ import collections import torch +from torch.autograd import Variable from torch import nn import torch.nn.functional as F -cpu_device = torch.device("cpu") - -# use gelu instead of relu activation function -import math - - -def gelu(x): - return 0.5 * x * (1 + torch.tanh(math.sqrt(math.pi / 2) * (x + 0.044715 * x ** 3))) +from utils.utils import gelu +cpu_device = torch.device("cpu") -from nets.EVGen import EV_Generator +from model.EVGen import EV_Generator class EVModel(nn.Module): - """CLASS FOR EVENT LAYERS.""" + """Event layer.""" def __init__(self, params, sizes): super(EVModel, self).__init__() @@ -32,7 +27,7 @@ def __init__(self, params, sizes): # dimensions if params['ner_reduce'] == False: - ent_dim = params['bert_dim'] * 3 + params['etype_dim'] # no reduce + ent_dim = params['bert_dim'] * 3 + params['etype_dim'] else: ent_dim = params['ner_reduced_size'] + params['etype_dim'] rel_dim = params['rel_reduced_size'] + params['rtype_dim'] + ent_dim @@ -61,23 +56,74 @@ def __init__(self, params, sizes): # predict modality self.modality_layer = nn.Linear(in_features=params['ev_reduced_size'], out_features=sizes['ev_size']) + # for ev loss + self.bce_with_logits_loss = nn.BCEWithLogitsLoss() + # others self.device = params['device'] def get_rel_input(self, rel_preds): """Read relation input.""" - l2r = rel_preds['pairs_idx'] - rpreds_ = rel_preds['preds'].data + # span indices for events + # training mode + if not self.params['predict']: + if self.training and self.params['use_gold_ner']: + gold_span = True + # train event only + elif not self.training and self.params['skip_ner'] and self.params['skip_rel'] and self.params[ + 'use_gold_ner']: + gold_span = True + else: + gold_span = False - # mapping relation type for 'OTHER' type to -1 - rpred_types = self.params['mappings']['rel2rtype_map'][rpreds_] + # predict mode + else: + if self.params['predict'] and (self.params['gold_eval'] or self.params['pipelines']): + gold_span = True + else: + gold_span = False - # extract only relation type != 'OTHER' (valid relations) - rpred_ids = (rpreds_ != self.params['voc_sizes']['rel_size'] - 1).nonzero().transpose(0, 1)[0] - rpred_ids = rpred_ids.to(cpu_device) # list: contain indices of the valid relations + # span indices + if gold_span: + span_indices = rel_preds['l2r'] + else: + span_indices = rel_preds['pairs_idx'] + + # relation indices for events + # training mode + if not self.params['predict']: + if self.training and self.params['use_gold_rel']: + gold_rel = True + + # train event only + elif not self.training and self.params['skip_ner'] and self.params['skip_rel'] and self.params[ + 'use_gold_rel']: + gold_rel = True + else: + gold_rel = False - return l2r, rpred_types, rpred_ids + # predict mode + else: + if self.params['predict'] and (self.params['gold_eval'] or self.params['pipelines']): + gold_rel = True + else: + gold_rel = False + + # relation indices + if gold_rel: + r_indices = rel_preds['truth'].data + else: + r_indices = 
rel_preds['preds'].data + + # relation type; non-relation to -1 + r_types = self.params['mappings']['rel2rtype_map'][r_indices] + + # extract positive relations, ignore non-relation + rpos_indices = (r_indices != self.params['voc_sizes']['rel_size'] - 1).nonzero().transpose(0, 1)[0] + rpos_indices = rpos_indices.to(cpu_device) + + return span_indices, r_types, rpos_indices def rtype_embedding_layer(self, rtype_): """Relation type embeddings.""" @@ -130,6 +176,7 @@ def get_arg_embeds(self, ent_embeds, rel_embeds, rtype_embeds, ev_arg_ids4nn): a2_embeds = ent_embeds[(a2ids_[0], a2ids_[1])] rt_embeds = rtype_embeds[rids] + args_embeds = torch.cat([r_embeds, rt_embeds, a2_embeds], dim=-1) # [number of arguments, rdim+rtypedim+edim] @@ -164,6 +211,7 @@ def event_representation(self, arg_embed_triggers, ev_cand_ids4nn, no_rel_type_e # no-argument if len(ev_struct[1]) == 0: + # arg_embed = concat[rel_embed, rel_type_embed, argument_embed] # since there is no argument, rel_embed is set as zeros no_rel_emb = torch.zeros((self.params['rel_reduced_size']), dtype=no_rel_type_embed.dtype, device=self.device) @@ -214,8 +262,12 @@ def event_representation(self, arg_embed_triggers, ev_cand_ids4nn, no_rel_type_e args_embeds_list.append(reduced_arg_embed) # calculate argument embed: by sum up all arguments or average, etc + # TODO: currently, use SUM args_embed = torch.sum(torch.stack(args_embeds_list, dim=0), dim=0) + # TODO: average + # args_embed = torch.mean(torch.stack(args_embeds_list, dim=0),dim=0) + # event embed: concatenate trigger embed and argument embed ev_embeds_.append(torch.cat([tr_embed, args_embed], dim=-1)) @@ -313,6 +365,7 @@ def event_nest_representation(self, arg_embed_triggers, ev_cand_ids4nn, no_rel_t # no-argument if len(ev_struct[1]) == 0: + # arg_embed = concat[rel_embed, rel_type_embed, argument_embed] # since there is no argument, rel_embed is set as zeros no_rel_emb = torch.zeros((self.params['rel_reduced_size']), dtype=no_rel_type_embed.dtype, device=self.device) @@ -363,6 +416,7 @@ def event_nest_representation(self, arg_embed_triggers, ev_cand_ids4nn, no_rel_t for xx2, inid in enumerate(io_ids): if inid == ioid: pid = pos_ids[xx2] + # pid = pos_ids[io_ids.index(ioid)] # entity argument if pid == (-1, -1): @@ -394,8 +448,12 @@ def event_nest_representation(self, arg_embed_triggers, ev_cand_ids4nn, no_rel_t # args_embeds_list.append(reduced_arg_embed) # calculate argument embed: by sum up all arguments or average, etc + # TODO: currently, use SUM args_embed = torch.sum(torch.stack(args_embeds_list, dim=0), dim=0) + # TODO: average + # args_embed = torch.mean(torch.stack(args_embeds_list, dim=0),dim=0) + # event embed: concatenate trigger embed and argument embed ev_embeds_.append(torch.cat([tr_embed, args_embed], dim=-1)) @@ -414,6 +472,7 @@ def predict(self, event_embeds): threshold = self.params['ev_threshold'] + event4class = gelu(self.hidden_layer1(event_embeds)) event4class = gelu(self.hidden_layer2(event4class)) prediction = self.l_class(event4class) @@ -431,11 +490,22 @@ def predict(self, event_embeds): prediction = prediction.flatten() - # return prediction, modality_pred, positive_idx, positive_ev # revise return event4class, prediction, positive_idx, positive_ev_embs + def calculate_ev_loss(self, prediction, ev_labels_): + """Loss.""" + + ev_labels = np.vstack(ev_labels_).ravel() + + positive_labels = ev_labels.copy() + positive_labels[positive_labels > 0] = 1 + ev_loss = self.bce_with_logits_loss(prediction, + torch.tensor(positive_labels, dtype=prediction.dtype, 
device=self.device)) + + return ev_loss + def predict_modality(self, positive_ev_embs, positive_ev_idx, mod_labels_): - """Predict modality, return modality predictions.""" + """Predict modality, return modality predictions and loss.""" # get labels mod_labels = np.vstack(mod_labels_).ravel() @@ -444,7 +514,7 @@ def predict_modality(self, positive_ev_embs, positive_ev_idx, mod_labels_): possitive_lbl = torch.tensor((mod_labels[positive_ev_idx] - 1), dtype=torch.long, device=self.device) - # prediction + # prediction and loss if possitive_lbl[possitive_lbl >= 0].shape[0] > 0: # prediction @@ -455,11 +525,18 @@ def predict_modality(self, positive_ev_embs, positive_ev_idx, mod_labels_): modality_pred = F.softmax(torch.tensor(modality_pred), dim=-1).data mod_preds = modality_pred.argmax(dim=-1) + # loss + modality_lbls = possitive_lbl[possitive_lbl >= 0] + mod_loss = F.cross_entropy(modality_preds, modality_lbls) + + # TODO: for debug only, remember to commend; modality pred=gold + # mod_preds = modality_lbls else: mod_preds = [] + mod_loss = 0 - return mod_preds + return mod_preds, mod_loss def create_output(self, all_ev_preds): """Create output for writing events.""" @@ -537,9 +614,9 @@ def create_output(self, all_ev_preds): return all_ev_output - def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): + def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch): """ - Create embeddings, prediction. + Create embeddings, prediction, loss. :param ent_embeds: [batch x a1id x embeds] :param rel_embeds: [rids x embeds] @@ -554,16 +631,26 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): + list of rids + list of argument ids - :return: prediction + :return: prediction, loss """ # store output all_preds_output = [] - enable_nested_ev = True - enable_modality = True + # flag to train nested event or not + if n_epoch >= self.params['ev_nested_epoch'] or self.params['predict']: + enable_nested_ev = True + else: + enable_nested_ev = False + + # flag to train modality or not + if n_epoch >= self.params['modality_epoch'] or self.params['predict']: + enable_modality = True + else: + enable_modality = False # store all predictions for flat and nested, maximum as 3 nested levels + # TODO: revise the maximum nested level later. 
Now fix 3 levels # positive ids: the current predicted indices; tr_ids: trigger indices of the candidate list all_positive_ids = -1 * np.ones((self.params['max_ev_level'] + 1), dtype=np.object) all_positive_tr_ids = -1 * np.ones((self.params['max_ev_level'] + 1), dtype=np.object) @@ -589,12 +676,13 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): # positive_ev_embs: embedding of predicted events: using for the next nested level event4class, prediction, positive_idx, positive_ev_embs = self.predict(ev_embeds) - empty_pred = True + # 6-ev loss + flat_ev_loss = self.calculate_ev_loss(prediction, ev_flat_cand_ids4nn['ev_labels_']) # for modality if enable_modality: - mod_preds = self.predict_modality(positive_ev_embs, positive_idx, - ev_flat_cand_ids4nn['mod_labels_']) + mod_preds, mod_losses = self.predict_modality(positive_ev_embs, positive_idx, + ev_flat_cand_ids4nn['mod_labels_']) else: mod_preds = [] @@ -611,6 +699,9 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): # for output all_preds_output.append([ev_flat_cand_ids4nn, ev_flat_arg_ids4nn, current_positive_ids, mod_preds]) + # nested loss + nest_ev_loss = 0 + # loop until stop nested event prediction or no more events predicted, or in limited nested levels while enable_nested_ev and len(current_positive_ids) > 0 and current_nested_level < self.params['max_ev_level']: @@ -625,6 +716,7 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): all_positive_ev_embs.append(reduced_ev_emb) # generate nested candidate indices + # 'ev_nest_cand_ids': ev_nest_cands_ids4nn, 'ev_nest_arg_ids4nn': ev_nest_arg_ids4nn ev_nest_ids4nn = self.ev_struct_generator._generate_nested_candidates(current_nested_level, ev_nest_cand_triggers, current_positive_tr_ids, @@ -637,8 +729,6 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): current_tr_ids = ev_nest_cand_ids4nn['trids_'] current_truth_ids = ev_nest_cand_ids4nn['truth_ids_'] - empty_pred = False - # check non-empty if len(ev_nest_cand_ids4nn['trids_']) > 0: @@ -654,10 +744,14 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): # prediction event4class, prediction, positive_idx, positive_ev_embs = self.predict(ev_embeds) + # ev loss + nest_ev_loss += self.calculate_ev_loss(prediction, ev_nest_cand_ids4nn['ev_labels_']) + # for modality if enable_modality: - mod_preds = self.predict_modality(positive_ev_embs, positive_idx, - ev_nest_cand_ids4nn['mod_labels_']) + mod_preds, mod_loss = self.predict_modality(positive_ev_embs, positive_idx, + ev_nest_cand_ids4nn['mod_labels_']) + mod_losses += mod_loss else: mod_preds = [] @@ -677,16 +771,27 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn): # 7-create output for writing events pred_ev_output = self.create_output(all_preds_output) - return pred_ev_output, empty_pred + # scale loss: if flat is stable, focus more on nested + if current_nested_level == 0: + ev_loss = flat_ev_loss + else: + ev_loss = flat_ev_loss * self.params['flat_ev_scale'] + nest_ev_loss * self.params['nest_ev_scale'] + + # add modality loss + if enable_modality: + ev_loss = ev_loss + mod_losses * self.params['modality_weight'] + + return pred_ev_output, ev_loss - def forward(self, ner_preds, rel_preds): + def forward(self, ner_preds, rel_preds, n_epoch): """Forward. - Given entities and relations, event structures, return event prediction. + Given entities and relations, event structures, return event prediction and loss. 
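+        Returns None if there are no predicted relations or no event candidates
+        could be built; otherwise returns {'output': ..., 'loss': ...}.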
""" # check empty relation prediction if len(rel_preds['preds'].data) == 0: - ev_preds = None - empty_pred = True + # ev_out = None + # ev_loss = Variable(torch.zeros(1, device=self.device)) + return None else: # 1-get input @@ -706,22 +811,28 @@ def forward(self, ner_preds, rel_preds): if np.ndim(rpred_types) > 0: rel_embeds = rel_preds['rel_embeds'] else: - rel_embeds = torch.zeros((1, self.params['rel_reduced_size']), dtype=torch.float32, device=self.device) + rel_embeds = torch.zeros((1,self.params['rel_reduced_size']), dtype=torch.float32, device=self.device) # avoid scalar error rpred_types = np.array([rpred_types]) + + # event + ev_idx = ner_preds['ev_idxs'] + ev_truth = ner_preds['truth_evs'] + ev_lbls = np.array(ner_preds['ev_lbls']) + # 2-generate event candidates - ev_ids4nn = self.ev_struct_generator._generate(etypes, tr_ids, l2r, rpred_types, rpred_ids - ) + ev_ids4nn = self.ev_struct_generator._generate(etypes, tr_ids, l2r, rpred_types, rpred_ids, ev_idx, + ev_truth, ev_lbls) - # 3-embeds, prediction + # 3-embeds, prediction, and loss # check empty if len(ev_ids4nn['ev_cand_ids4nn']['trids_']) > 0: - ev_preds, empty_pred = self.calculate(ent_embeds, rel_embeds, rpred_types, ev_ids4nn) + ev_out, ev_loss = self.calculate(ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch) + return {'output': ev_out, 'loss': ev_loss} else: - ev_preds = None - empty_pred = True - - return ev_preds, empty_pred + # ev_out = None + # ev_loss = Variable(torch.zeros(1, device=self.device)) + return None \ No newline at end of file diff --git a/nets/NERNet.py b/model/NERNet.py similarity index 70% rename from nets/NERNet.py rename to model/NERNet.py index 6e8ccbc..58cc5ab 100644 --- a/nets/NERNet.py +++ b/model/NERNet.py @@ -2,8 +2,11 @@ import numpy as np import torch import torch.nn as nn +from torch.nn import functional as F -from bert.modeling import BertModel, BertPreTrainedModel +from torchnlp.word_to_vector.pretrained_word_vectors import _PretrainedWordVectors + +from bert.modeling import BertModel, BertPreTrainedModel, BertLayerNorm class NestedNERModel(BertPreTrainedModel): @@ -20,12 +23,44 @@ def __init__(self, config, params): self.max_span_width = params["max_span_width"] - self.bert = BertModel(config) + # for lstm + if self.params['use_lstm']: + self.pretrain_word_vectors = _PretrainedWordVectors( + name=params["pretrain_word_model"], + cache="caches", + ) + + self.lstm = nn.LSTM( + input_size=self.pretrain_word_vectors.dim, + hidden_size=config.hidden_size // 2, + num_layers=2, + batch_first=True, + dropout=config.hidden_dropout_prob, + bidirectional=True, + ) + + # or bert + else: + self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.entity_classifier = nn.Linear(config.hidden_size * 3, self.num_entities) - self.trigger_classifier = nn.Linear(config.hidden_size * 3, self.num_triggers) + if params['ner_reduce']: + reduced_size = params['ner_reduced_size'] + + # ! 
REDUCE + self.reduce = nn.Sequential( + nn.Linear(config.hidden_size * 3, reduced_size), + # nn.ReLU(), + # nn.Linear(1024, 1024), + BertLayerNorm(reduced_size, eps=1e-12), + nn.Dropout(config.hidden_dropout_prob), + ) + self.entity_classifier = nn.Linear(reduced_size, self.num_entities) + self.trigger_classifier = nn.Linear(reduced_size, self.num_triggers) + else: + self.entity_classifier = nn.Linear(config.hidden_size * 3, self.num_entities) + self.trigger_classifier = nn.Linear(config.hidden_size * 3, self.num_triggers) self.register_buffer( "label_ids", @@ -50,9 +85,25 @@ def forward( device = all_ids.device max_span_width = self.max_span_width - embeddings, sentence_embedding = self.bert( + # use bert + if self.params['use_lstm']: + word_embeddings = torch.stack([self.pretrain_word_vectors[tokens].to(device=device) for tokens in all_tokens]) + + self.lstm.flatten_parameters() + + lstm_embeddings, _ = self.lstm(word_embeddings) + + embeddings = lstm_embeddings + sentence_embedding = lstm_embeddings[:, 0] + + # or bert + else: + embeddings, sentence_embedding = self.bert( all_ids, attention_mask=all_attention_masks, output_all_encoded_layers=False - ) # (B, S, H) (B, 128, 768) + ) # (B, S, H) (B, 128, 768) + + # ! REDUCE + # embeddings = self.dropout(embeddings) # (B, S, H) (B, 128, 768) flattened_token_masks = all_token_masks.flatten() # (B * S, ) @@ -62,9 +113,17 @@ def forward( flattened_token_masks ) # (all_actual_tokens, ) - flattened_embeddings = torch.index_select( + # for lstm + if self.params['use_lstm']: + flattened_embeddings = torch.index_select( + embeddings.reshape(-1, embeddings.size(-1)), 0, flattened_embedding_indices + ) # (all_actual_tokens, H) + + # or bert + else: + flattened_embeddings = torch.index_select( embeddings.view(-1, embeddings.size(-1)), 0, flattened_embedding_indices - ) # (all_actual_tokens, H) + ) # (all_actual_tokens, H) span_starts = ( torch.arange(flattened_embeddings.size(0), device=device) @@ -177,6 +236,7 @@ def forward( span_start_embeddings, span_mean_embeddings, span_end_embeddings, + # span_width_embeddings, ), dim=1, ) # (all_valid_spans, H * 3 + distance_dim) @@ -210,6 +270,16 @@ def forward( all_span_masks ] # (all_valid_spans, num_entities + num_triggers) + actual_trigger_labels, actual_entity_labels = torch.split( + actual_span_labels, [self.num_triggers, self.num_entities], dim=-1 + ) # (all_valid_spans, num_entities), (all_valid_spans, num_triggers) + + # criterion = nn.CrossEntropyLoss(weight=self.class_weights) + + # return F.binary_cross_entropy_with_logits( + # preds, actual_span_labels, weight=self.class_weights + # ) # Computes loss + all_preds = torch.cat( (trigger_preds, entity_preds), dim=-1 ) # (all_valid_spans, num_entities + num_triggers) @@ -223,6 +293,16 @@ def forward( all_preds[~all_trigger_masks, : self.num_triggers] = 0 all_preds[~all_entity_masks, self.num_triggers:] = 0 + # Compute entity loss + entity_loss = F.binary_cross_entropy_with_logits( + entity_preds[all_entity_masks], actual_entity_labels[all_entity_masks] + ) + + # Compute trigger loss + trigger_loss = F.binary_cross_entropy_with_logits( + trigger_preds[all_trigger_masks], actual_trigger_labels[all_trigger_masks] + ) + # Support for random-noise adding trick entity_coeff = all_entity_masks.sum().float() trigger_coeff = all_trigger_masks.sum().float() @@ -231,6 +311,14 @@ def forward( entity_coeff /= denominator trigger_coeff /= denominator + if self.num_triggers > 0: + total_loss = entity_coeff * entity_loss + trigger_coeff * trigger_loss + else: + 
total_loss = entity_coeff * entity_loss + + # In case the corpus don't have triggers + # total_loss = entity_loss + _, all_preds_top_indices = torch.topk(all_preds, k=self.ner_label_limit, dim=-1) # Convert binary value to label ids @@ -268,7 +356,13 @@ def forward( all_aligned_preds = np.array(all_aligned_preds) + # For checking, will be commented if passes for all tests + # assert ( + # np.sort(all_aligned_preds, axis=-1) == np.sort(all_preds, axis=-1) + # ).all() + return ( + total_loss, all_aligned_preds, all_golds, sentence_sections, diff --git a/model/RELNet.py b/model/RELNet.py new file mode 100644 index 0000000..e511ae2 --- /dev/null +++ b/model/RELNet.py @@ -0,0 +1,309 @@ +import numpy as np +import torch +import torch.nn.functional as f +from torch import nn + +from eval.evalRE import calc_stats +from utils.utils import gelu + + +class RELModel(nn.Module): + """Relation layer.""" + + def __init__(self, params, sizes): + super(RELModel, self).__init__() + + # entity type + self.type_embed = nn.Embedding(num_embeddings=sizes['etype_size'] + 1, + embedding_dim=params['etype_dim'], + padding_idx=sizes['etype_size']) + + # entity dim + if params['ner_reduce'] == False: + ent_dim = params['bert_dim'] * 3 + params['etype_dim'] + else: + ent_dim = params['ner_reduced_size'] + params['etype_dim'] + + # layers + self.hidden_layer1 = nn.Linear(in_features=2 * ent_dim + params['bert_dim'], + out_features=params['hidden_dim'], bias=False) + self.hidden_layer2 = nn.Linear(in_features=params['hidden_dim'], + out_features=params['rel_reduced_size'], bias=False) + self.l_class = nn.Linear(in_features=params['rel_reduced_size'], + out_features=sizes['rel_size']) + + # others + self.device = params['device'] + self.params = params + self.sizes = sizes + + def _create_type_representation(self, bert_embeds, etypes_): + """Create entity type embeddings""" + + # get dim + self.b, self.w, _ = bert_embeds.shape + self.e = etypes_.shape[1] + + # non-entity + etypes_[etypes_ == -1] = self.sizes['etype_size'] + + # type embeddings + etype_embeds = self.type_embed(etypes_) # (batch_size, entity_dim, type_dim) + + return etype_embeds + + def _create_pair_representation(self, etok_embeds, etype_embeds): + """Create entity pair embeddings: Represent a sentence as a matrix of shape(B, E, E, dim)""" + + # concat: entities token and type embeddings + pair_embeds = torch.cat((etok_embeds, etype_embeds), dim=2) + + # save for event layer + type2_embeds = pair_embeds.clone() + + return pair_embeds, type2_embeds + + def _generate_l2r_pairs(self, pair_embeds, s_embeds, indices, rgtruth): + """Generate left-to-right pair candidates embeddings""" + + # pair embeddings + l2r_embeds = torch.cat( + (pair_embeds[(indices[0], indices[1])], pair_embeds[(indices[0], indices[2])], s_embeds[indices[0]]), + dim=-1) + + # pair labels + l2r_truth = [] + for b, l, r in zip(indices[0], indices[1], indices[2]): + l2r_truth.append(rgtruth[b.item()].get((l.item(), r.item()), -1)) + l2r_truth = np.asarray(l2r_truth) + + return l2r_embeds, l2r_truth + + def _generate_r2l_pairs(self, pair_embeds, s_embeds, indices, rgtruth): + """Generate right-to-left pair candidates embeddings""" + + # pair embeddings + r2l_embeds = torch.cat( + (pair_embeds[(indices[0], indices[2])], pair_embeds[(indices[0], indices[1])], s_embeds[indices[0]]), + dim=-1) + + # pair labels + r2l_truth = [] + for b, r, l in zip(indices[0], indices[2], indices[1]): + r2l_truth.append(rgtruth[b.item()].get((r.item(), l.item()), -1)) + r2l_truth = np.asarray(r2l_truth) + + 
return r2l_embeds, r2l_truth + + def _transpose_gold_indices(self, g_indices_): + """Extract gold pairs indices""" + # gold indices: batch, left, right + gids_b = [] + gids_l = [] + gids_r = [] + for b_idx, l2r_batch in enumerate(g_indices_): + if l2r_batch: + gids_b.extend([b_idx] * len(l2r_batch[0])) + gids_l.extend(l2r_batch[0]) + gids_r.extend(l2r_batch[1]) + g_indices = np.asarray([gids_b, gids_l, gids_r]) + return g_indices + + def predict(self, pair_embeds, g_indices_, p_indices, rgtruth_, sent_embeds): + """Classify relations.""" + + # 1-dropout + if self.training: + if self.params['dropout'] > 0: + pair_embeds = f.dropout(pair_embeds, p=self.params['dropout']) + + # 2-transpose gold pairs indices + g_indices = self._transpose_gold_indices(g_indices_) + + # 3-create left-to-right pairs + # 3.1-training mode + if not self.params['predict']: + + # i-gold ner + if self.training and self.params['use_gold_ner']: + use_gold = True + + # ii-train relation only: use gold ner + elif not self.training and self.params['skip_ner'] and self.params['rel_epoch'] >= ( + self.params['epoch'] - 1) and self.params['use_gold_ner']: + use_gold = True + + # iii-train event only: use gold rel + elif not self.training and self.params['skip_ner'] and self.params['skip_rel'] and self.params[ + 'use_gold_rel']: + use_gold = True + + # iv- + else: + use_gold = False + + # 3.2-predict mode + else: + + # gold or pipeline + if self.params['gold_eval'] or self.params['pipelines']: + if self.params['pipelines'] and self.params['pipe_flag'] != 2: + use_gold = False + else: + use_gold = True + + # joint + else: + use_gold = False + + # 3.3-get pair candidates embeddings and labels: from gold or predicted indices + if use_gold: + l2r_embeds, l2r_truth = self._generate_l2r_pairs(pair_embeds, sent_embeds, g_indices, rgtruth_) + else: + l2r_embeds, l2r_truth = self._generate_l2r_pairs(pair_embeds, sent_embeds, p_indices, rgtruth_) + + # 4-for non-relation label + if not self.params['predict']: + if np.ndim(l2r_truth) > 0: + l2r_truth[l2r_truth == -1] = self.params['mappings']['rel_map']['1:Other:2'] + else: + if l2r_truth == -1: + l2r_truth = self.params['mappings']['rel_map']['1:Other:2'] + l2r_truth = np.array([l2r_truth]) + + # 5-NN on left-to-right pairs + rel_l2r_embeds = gelu(self.hidden_layer1(l2r_embeds)) + rel_l2r_embeds = gelu(self.hidden_layer2(rel_l2r_embeds)) + l2r_preds = self.l_class(rel_l2r_embeds) # (B*r, N) + + # 6-check dim + if not self.params['predict']: + assert (l2r_preds.shape[0] == l2r_truth.shape[0]), \ + "mismatch in ground-truth & prediction shapes left-to-right" + + # 7-both directions + if self.params['direction'] != 'l2r': + + # training mode + if not self.params['predict']: + + # i-gold ner + if self.training and self.params['use_gold_ner']: + use_gold = True + + # ii-train rel only + elif not self.training and self.params['skip_ner'] and self.params['rel_epoch'] >= ( + self.params['epoch'] - 1) and self.params['use_gold_ner']: + use_gold = True + + # iii-train ev only + elif not self.training and self.params['skip_ner'] and self.params['skip_rel'] and self.params[ + 'use_gold_rel']: + use_gold = True + + # iv + else: + use_gold = False + + # predict mode + else: + + # gold or pipeline + if self.params['gold_eval'] or self.params['pipelines']: + if self.params['pipelines'] and self.params['pipe_flag'] != 2: + use_gold = False + else: + use_gold = True + else: + use_gold = False + + # pair candidates embeddings and labels + if use_gold: + r2l_embeds, r2l_truth = 
self._generate_r2l_pairs(pair_embeds, sent_embeds, g_indices, rgtruth_) + else: + r2l_embeds, r2l_truth = self._generate_r2l_pairs(pair_embeds, sent_embeds, p_indices, rgtruth_) + + # non-relation type + if not self.params['predict']: + if np.ndim(r2l_truth) > 0: + r2l_truth[r2l_truth == -1] = self.params['mappings']['rel_map']['1:Other:2'] + else: + if r2l_truth == -1: + r2l_truth = self.params['mappings']['rel_map']['1:Other:2'] + r2l_truth = np.array([r2l_truth]) + + # NN for right-to-left pairs + rel_r2l_embeds = gelu(self.hidden_layer1(r2l_embeds)) + rel_r2l_embeds = gelu(self.hidden_layer2(rel_r2l_embeds)) + r2l_preds = self.l_class(rel_r2l_embeds) + + # check dim + if not self.params['predict']: + assert (r2l_preds.shape[0] == r2l_truth.shape[0]), \ + "mismatch in ground-truth & prediction shapes right-to-left" + + # both directions + return rel_l2r_embeds, l2r_preds, l2r_truth, rel_r2l_embeds, r2l_preds, r2l_truth, pair_embeds, g_indices + + # only left-to-right + else: + return rel_l2r_embeds, l2r_preds, l2r_truth, pair_embeds, g_indices + + def forward(self, batch_input): + + # 1-entity type embeddings + type_embeds = self._create_type_representation(batch_input['embeddings'], batch_input['ent_types']) + + # 2-create pair embeddings + pair_embeds, type2_embeds = self._create_pair_representation(batch_input['ent_embeds'], type_embeds) + pair_embeds = pair_embeds.view(self.b, self.e, pair_embeds.shape[2]) + + # 3-predictions and labels + predictions = self.predict(pair_embeds, batch_input['l2rs'], batch_input['pairs_idx'], batch_input['gtruths'], + batch_input['sentence_embeds']) + + # 4-classify: use both directions + acc_loss = 0 + if self.params['direction'] != 'lr2': + + # get output + rel_l2r_embeds, l2r_preds, l2r_truth, rel_r2l_embeds, r2l_preds, r2l_truth, pair_embeds, g_indices = predictions + + # training + if not self.params['predict']: + if l2r_preds.shape[0] == 0: + return {'valid': False} + + # relation loss + l2r_loss = f.cross_entropy(l2r_preds, torch.tensor(l2r_truth, device=self.device).long()) + r2l_loss = f.cross_entropy(r2l_preds, torch.tensor(r2l_truth, device=self.device).long()) + acc_loss = l2r_loss + r2l_loss + + # prediction and label + r_preds = (f.softmax(l2r_preds, dim=1).data, f.softmax(r2l_preds, dim=1).data) + r_gtruth = (l2r_truth, r2l_truth) + + # use only left-to-right direction + else: + + # get output + rel_l2r_embeds, l2r_preds, l2r_truth, pair_embeds, g_indices = predictions + + # training + if not self.params['predict']: + # relation loss + acc_loss = f.cross_entropy(l2r_preds, torch.tensor(l2r_truth, device=self.device).long()) + + # prediction and label + r_preds = f.softmax(l2r_preds, dim=1).data + r_gtruth = l2r_truth.data + + # get predicted type and scores + new_rpreds, new_rgtruth, no_matched_rels, true_pos, false_pos, false_neg = calc_stats(r_preds, r_gtruth, + self.params) + + return {'valid': True, 'true_pos': true_pos, 'false_pos': false_pos, 'false_neg': false_neg, + 'preds': new_rpreds, 'enttoks_type_embeds': type2_embeds, + 'truth': new_rgtruth, 'no_matched_rel': no_matched_rels, + 'l2r': g_indices, 'pairs_idx': batch_input['pairs_idx'], 'rel_embeds': rel_l2r_embeds, + 'pair4class': pair_embeds, 'loss': acc_loss} diff --git a/model/deepEM.py b/model/deepEM.py new file mode 100644 index 0000000..4e8958c --- /dev/null +++ b/model/deepEM.py @@ -0,0 +1,484 @@ +import copy +from collections import defaultdict + +import numpy as np +import torch +import torch.nn.functional as f +from torch import nn +from torch.autograd import 
Variable + +from model import EVNet +from model import RELNet +from model.NERNet import NestedNERModel +from utils import utils + +cpu_device = torch.device("cpu") + + +class DeepEM(nn.Module): + """ + Network architecture + """ + + def __init__(self, params): + super(DeepEM, self).__init__() + + sizes = params['voc_sizes'] + device = params['device'] + + self.NER_layer = NestedNERModel.from_pretrained(params['bert_model'], params=params) + self.REL_layer = RELNet.RELModel(params, sizes) + self.EV_layer = EVNet.EVModel(params, sizes) + + self.trigger_id = -1 + + if params['train']: + self.beta = 1 + else: + self.beta = params['beta'] + + self.device = device + self.params = params + + def process_ner_output(self, nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_entity_masks, nn_trigger_masks, + nn_span_labels, span_terms, max_span_labels, nn_span_indices): + """Process NER output to prepare for training relation and event layers""" + + # entity output + ner_preds = {} + + # predict entity + ner_loss, e_preds, e_golds, sentence_sections, span_masks, embeddings, sentence_emb, trigger_indices = self.NER_layer( + all_tokens=nn_tokens, + all_ids=nn_ids, + all_token_masks=nn_token_mask, + all_attention_masks=nn_attention_mask, + all_entity_masks=nn_entity_masks, + all_trigger_masks=nn_trigger_masks, + all_span_labels=nn_span_labels, + ) + + # ! Note that these below lines run on CPU + sentence_sections = sentence_sections.detach().cpu().numpy()[:-1] + all_span_masks = span_masks.detach() > 0 + + # Embedding of each span + embeddings = torch.split(embeddings, torch.sum(all_span_masks, dim=-1).tolist()) + + # Pred of each span + e_preds = np.split(e_preds.astype(int), sentence_sections) + e_preds = [pred.flatten() for pred in e_preds] + ner_preds['preds'] = e_preds + + e_golds = np.split(e_golds.astype(int), sentence_sections) + e_golds = [gold.flatten() for gold in e_golds] + ner_preds['golds'] = e_golds + ner_preds['gold_terms'] = copy.deepcopy(span_terms) + + replace_term = True + if self.params['predict']: + if self.params['gold_eval'] or (self.params['pipelines'] and self.params['pipe_flag'] != 0): + replace_term = False + + if self.params["ner_predict_all"]: + if self.params['predict']: + if self.params['gold_eval'] or (self.params['pipelines'] and self.params['pipe_flag'] != 0): + e_preds = e_golds + span_terms = ner_preds['gold_terms'] + else: + if self.params['skip_ner'] and self.params['skip_rel'] and self.params['use_gold_ner'] and self.params[ + 'use_gold_rel']: + e_preds = e_golds + span_terms = ner_preds['gold_terms'] + + if replace_term: + for items in span_terms: + items.term2id.clear() + items.id2term.clear() + + # Overwrite triggers + if self.trigger_id == -1: + self.trigger_id = utils.get_max_entity_id(span_terms) + 10000 + + trigger_idx = self.trigger_id + 1 + for sentence_idx, span_preds in enumerate(e_preds): + for pred_idx, label_id in enumerate(span_preds): + if label_id > 0: + term = "T" + str(trigger_idx) + + # check trigger + if label_id in self.params['mappings']['nn_mapping']['trTypes_Ids']: + term = "TR" + str(trigger_idx) + + span_terms[sentence_idx].id2term[pred_idx] = term + span_terms[sentence_idx].term2id[term] = pred_idx + trigger_idx += 1 + + self.trigger_id = trigger_idx + else: + if replace_term: + # Overwrite triggers + if self.trigger_id == -1: + self.trigger_id = utils.get_max_entity_id(span_terms) + 10000 + + trigger_idx = self.trigger_id + 1 + for sentence_idx, span_preds in enumerate(e_preds): + # Update gold labels + + # store gold entity index 
(a1) + a1ent_set = set() + + for span_idx, span_term in span_terms[sentence_idx].id2term.items(): + + if span_term != "O" and not span_term.startswith("TR") and span_preds[span_idx] != 255: + + # but do not replace for entity in a2 files + span_label = span_terms[sentence_idx].id2label[ + span_idx] # entity type, e.g: Gene_or_gene_product + if span_label not in self.params['a2_entities']: + # replace for entity (using gold entity) + span_preds[span_idx] = e_golds[sentence_idx][span_idx] + + # save this index to ignore prediction + a1ent_set.add(span_idx) + + for pred_idx, label_id in enumerate(span_preds): + span_term = span_terms[sentence_idx].id2term.get(pred_idx, "O") + + # if this entity in a1: skip this span + if pred_idx in a1ent_set: + continue + + remove_span = False + + # add prediction for trigger or entity a2 + if label_id > 0: + term = '' + + # check trigger + if label_id in self.params['mappings']['nn_mapping']['trTypes_Ids']: + term = "TR" + str(trigger_idx) + + # is entity + else: + etype_label = self.params['mappings']['nn_mapping']['id_tag_mapping'][label_id] + + # check this entity type in a2 or not + if etype_label in self.params['a2_entities']: + term = "T" + str(trigger_idx) + else: + remove_span = True + + if len(term) > 0: + span_terms[sentence_idx].id2term[pred_idx] = term + span_terms[sentence_idx].term2id[term] = pred_idx + trigger_idx += 1 + + # null prediction + if label_id == 0 or remove_span: + # do not write anything + span_preds[pred_idx] = 0 + + # remove this span + if span_term.startswith("T"): + del span_terms[sentence_idx].id2term[pred_idx] + del span_terms[sentence_idx].term2id[span_term] + + span_preds[span_preds == 255] = 0 + self.trigger_id = trigger_idx + + num_padding = max_span_labels * self.params["ner_label_limit"] + + e_preds = [np.pad(pred, (0, num_padding - pred.shape[0]), + 'constant', constant_values=-1) for pred in e_preds] + e_golds = [np.pad(gold, (0, num_padding - gold.shape[0]), + 'constant', constant_values=-1) for gold in e_golds] + + e_preds = torch.tensor(e_preds, device=self.device) + nn_span_labels = torch.tensor(e_golds, device=self.device) + + embeddings = [f.pad(embedding, (0, 0, 0, max_span_labels - embedding.shape[0]), + 'constant', value=0) for embedding in embeddings] + + embeddings = torch.stack(embeddings) + embeddings = embeddings.unsqueeze(dim=2).expand(-1, -1, self.params["ner_label_limit"], -1) + embeddings = embeddings.reshape(embeddings.size(0), -1, embeddings.size(-1)) + + # output for ner + ner_preds['loss'] = ner_loss + ner_preds['terms'] = span_terms + ner_preds['span_indices'] = nn_span_indices + + # For pre-train event layer + use_gold = False + if (not self.params['predict'] and self.params['skip_ner'] and self.params['skip_rel'] and self.params[ + 'use_gold_ner'] and self.params['use_gold_rel']) or (self.params['gold_eval'] or self.params['pipelines']): + use_gold = True + if use_gold: + ner_preds['nner_preds'] = e_golds + else: + ner_preds['nner_preds'] = e_preds.detach().cpu().numpy() + + return embeddings, e_preds, e_golds, nn_span_labels, sentence_emb, ner_preds + + def generate_entity_pairs_4rel(self, bert_embeds, p_span_indices, g_span_indices): + """Prepare entity pairs for relation candidates""" + + # use gold or predicted span indices + # training mode + if not self.params['predict']: + if self.training and self.params['use_gold_ner']: + use_gold = True + # train relation only + elif not self.training and self.params['skip_ner'] and self.params['rel_epoch'] >= ( + self.params['epoch'] - 1) and 
self.params['use_gold_ner']: + use_gold = True + # train event only + elif not self.training and self.params['skip_ner'] and self.params['skip_rel'] and self.params[ + 'use_gold_rel']: + use_gold = True + else: + use_gold = False + + # predict mode + else: + if self.params['gold_eval'] or self.params['pipelines']: + use_gold = True + else: + use_gold = False + + if use_gold: + span_indices = g_span_indices + else: + span_indices = p_span_indices + + # positive indices + pos_indices = (span_indices > 0).nonzero().transpose(0, 1).long() + + # entity types + e_types = torch.full((span_indices.shape[0], span_indices.shape[1]), -1, dtype=torch.int64, + device=self.device) + + # entity and trigger indices + e_indices = torch.zeros((span_indices.shape[0], span_indices.shape[1]), dtype=torch.long) + tr_indices = torch.zeros((span_indices.shape), dtype=torch.int64, device=self.device) + + # store entity indices in batch and list of triggers + batch_eids_list = defaultdict(list) + tr_list = [] + + # store entity in each batch + batch_ent_list = defaultdict(list) + + for batch_id, a1id in enumerate(pos_indices[0]): + + # index + a2id = pos_indices[1][batch_id] + + # entity type + type_a1 = self.params['mappings']['nn_mapping']['tag2type_map'][span_indices[a1id][a2id].item()] + e_types[a1id][a2id] = torch.tensor(type_a1, device=self.device) + + # masked + e_indices[a1id][a2id] = 1 + + # trigger + if type_a1 in self.params['trTypes_Ids']: + tr_indices[a1id][a2id] = 1 + tr_list.append((a1id, a2id)) + + # entity + else: + batch_ent_list[a1id.item()].append(a2id) + + batch_eids_list[a1id.item()].append(a2id) + + # prepare for entity and trigger embeddings + e_embeds = bert_embeds.clone() + tr_embeds = bert_embeds.clone() + e_embeds[e_indices == 0] = torch.zeros((bert_embeds.shape[2]), dtype=bert_embeds.dtype, device=self.device) + tr_embeds[tr_indices == 0] = torch.zeros((bert_embeds.shape[2]), dtype=bert_embeds.dtype, device=self.device) + + # indices of pairs (trigger-entity OR trigger-trigger) for relation candidates + pair_indices = [] + + if len(tr_list): + for batch_id, trig_id in tr_list: + if len(batch_eids_list[batch_id.item()]) > 1: + + # enable relation between triggers + if self.params['enable_triggers_pair']: + # get all entity ids in this batch + b_eids = batch_eids_list[batch_id.item()].copy() + + # remove this trigger to avoid self relation + b_eids.remove(trig_id.clone().detach()) + + # or only between trigger and entity + else: + # pair with only entity + b_eids = batch_ent_list[batch_id.item()].copy() + + # check empty + if len(b_eids) > 0: + # make pairs + batch_pair_idx = torch.tensor([[batch_id], [trig_id]]).repeat(1, len(b_eids)) + batch_pair_idx = torch.cat( + (batch_pair_idx, torch.tensor(b_eids).view(1, len(b_eids))), dim=0) + + # add to pairs + pair_indices.append(batch_pair_idx) + + if len(pair_indices) > 0: + pair_indices = torch.cat(pair_indices, dim=-1) + + return e_embeds, tr_embeds, e_types, tr_indices, pair_indices + + def _init_joint(self, n_epoch): + """Flags to enable using the predicted from the previous output or not""" + + # init layer output + rel_preds = None + ev_preds = None + + # enable jointly training + enable_rel = True + enable_ev = True + + # training + if not self.params['predict']: + + # pre-train ner only: unable relation and event layers + if not self.params['skip_ner'] and n_epoch <= self.params['ner_epoch']: + enable_rel = False + enable_ev = False + + # pre-train relation only: unable event layer + if not self.params['skip_rel'] and n_epoch <= 
self.params['rel_epoch']: + enable_ev = False + + # predict on pipeline mode + elif self.params['predict'] and self.params['pipelines']: + + # for ner + if self.params['pipe_flag'] == 0: + enable_rel = False + enable_ev = False + + # for relation + elif self.params['pipe_flag'] == 1: + enable_rel = True + enable_ev = False + + # for event + else: + enable_rel = False + enable_ev = True + + return enable_rel, enable_ev, rel_preds, ev_preds + + def _accumulate_loss(self, ner_preds, rel_preds, ev_preds, n_epoch): + """To calculate the total loss from the layers' loss""" + + # total loss + acc_loss = 0 + + if not self.params['predict']: + # add ner loss + if not self.params['skip_ner']: + + # add scaled loss according to the epoch range + if n_epoch <= self.params['ner_epoch_limit']: + acc_loss = ner_preds['loss'] * self.params['ner_loss_weight_main'] + else: + acc_loss = ner_preds['loss'] * self.params['ner_loss_weight_minor'] + + # add relation loss + if not self.params['skip_rel'] and rel_preds != None: + + # check non-empty + if rel_preds['valid']: + + # add scaled loss according to the epoch range + if n_epoch <= self.params['rel_epoch_limit'] and n_epoch > self.params['ner_epoch_limit']: + acc_loss += rel_preds['loss'] * self.params['rel_loss_weight_main'] + else: + acc_loss += rel_preds['loss'] * self.params['rel_loss_weight_minor'] + + # add event loss + if ev_preds != None: + + # add scaled loss according to the epoch range + if n_epoch <= self.params['rel_epoch_limit']: + acc_loss += ev_preds['loss'] * self.params['ev_loss_weight_minor'] + else: + acc_loss += ev_preds['loss'] * self.params['ev_loss_weight_main'] + + # zero + if acc_loss == 0: + acc_loss = Variable(torch.zeros(1, device=self.params['device'])) + + return acc_loss + + def forward(self, batch_input, n_epoch=0): + + """Joint model interface.""" + + # 1 - get input + nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, nn_gtruth, nn_l2r, span_terms, \ + nn_truth_ev, nn_ev_idxs, ev_lbls, etypes, max_span_labels = batch_input + + # 2 - predict entity and process output + embeddings, e_preds, e_golds, nn_span_labels, sentence_emb, ner_preds = self.process_ner_output( + nn_tokens, nn_ids, + nn_token_mask, + nn_attention_mask, + nn_entity_masks, + nn_trigger_masks, + nn_span_labels, + span_terms, + max_span_labels, + nn_span_indices + ) + + # 3 - initialize joint training + enable_rel, enable_ev, rel_preds, ev_preds = self._init_joint(n_epoch) + + # 4 - joint training + if enable_rel or enable_ev: + + # 4.1 - prepare input for joint model + e_embeds, tr_embeds, e_types, tr_ids, pair_indices = self.generate_entity_pairs_4rel(bert_embeds=embeddings, + p_span_indices=e_preds, + g_span_indices=nn_span_labels) + + # check non-empty + if len(pair_indices) > 0: + + joint_input = {'preds': e_preds, 'golds': e_golds, 'embeddings': embeddings, + 'ent_embeds': e_embeds, 'tr_embeds': tr_embeds, 'tr_ids': tr_ids, + 'ent_types': e_types, 'pairs_idx': pair_indices, 'e_types': etypes.long(), + 'l2rs': nn_l2r, + 'gtruths': nn_gtruth, 'truth_evs': nn_truth_ev, 'ev_idxs': nn_ev_idxs, + 'ev_lbls': ev_lbls, + 'sentence_embeds': sentence_emb} + + # 4.2 - training relation layer + if enable_rel: + rel_preds = self.REL_layer(joint_input) + + # 4.4 - training event layer + if enable_ev: + + # get relation output + rel_preds = self.REL_layer(joint_input) + + # check non-empty relation + if rel_preds['valid']: + # call event layer + ev_preds = 
self.EV_layer(joint_input, rel_preds, n_epoch) + + # joint model loss + acc_loss = self._accumulate_loss(ner_preds, rel_preds, ev_preds, n_epoch) + + return ner_preds, rel_preds, ev_preds, acc_loss diff --git a/model/training.py b/model/training.py new file mode 100644 index 0000000..6414bf9 --- /dev/null +++ b/model/training.py @@ -0,0 +1,225 @@ +import torch +from tqdm import tqdm, trange + +import os +import pickle + +from eval.evaluation import eval +from utils import utils +from utils.utils import debug, path +from utils.utils import ( + extract_scores, + is_best_epoch, + write_best_epoch, +) + + +# try: +# from apex import amp +# except ImportError: +# pass + + +def train( + train_data_loader, + dev_data_loader, + train_data, + dev_data, + params, + model, + optimizer +): + is_params_saved = False + global_steps = 0 + + gradient_accumulation_steps = params["gradient_accumulation_steps"] + + ner_prf_dev, rel_prf_dev, ev_prf_dev = [], [], [] + + ner_prf_dev_str, ner_prf_dev_sof, rel_prf_dev_str, rel_prf_dev_sof = [], [], [], [] + + # create output directory for results + result_dir = params['result_dir'] + if not os.path.exists(result_dir): + os.makedirs(result_dir) + + if params['freeze_ner']: + for p in model.NER_layer.parameters(): + p.requires_grad = False + + if params['freeze_rel']: + for p in model.REL_layer.parameters(): + p.requires_grad = False + + if params['freeze_bert']: + for p in model.NER_layer.bert.parameters(): + p.requires_grad = False + + # Save params: + if params['save_params']: + if not is_params_saved: + saved_params_path = result_dir + params['task_name'] + '.param' + with open(saved_params_path, "wb") as f: + pickle.dump(params, f) + # is_params_saved = True + print('SAVED PARAMETERS!') + + for epoch in trange(int(params["epoch"]), desc="Epoch"): + # TRAIN loop + model.train() + tr_loss = 0 + nb_tr_steps = 0 + + print() + print( + "====================================================================================================================") + print() + debug(f"[1] Epoch: {epoch}\n") + + for step, batch in enumerate( + tqdm(train_data_loader, desc="Iteration", leave=False) + ): + + # Start training batch + tr_data_ids = batch + tensors = utils.get_tensors(tr_data_ids, train_data, params) + + ner_preds, rel_preds, ev_preds, loss = model(tensors, epoch) + + if gradient_accumulation_steps > 1: + loss /= gradient_accumulation_steps + + tr_loss += loss.item() + nb_tr_steps += 1 + + if loss != 0: + if params["fp16"]: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if (step + 1) % params["gradient_accumulation_steps"] == 0: + + optimizer.step() + optimizer.zero_grad() + global_steps += 1 + + # Clear GPU unused RAM: + if params['gpu'] >= 0: + torch.cuda.empty_cache() + + print() + debug(f"[2] Train loss: {tr_loss / nb_tr_steps}\n") + debug(f"[3] Global steps: {global_steps}\n") + + print( + "+" * 10 + "RUN EVALUATION" + "+" * 10 + ) + ner_score, is_eval_rel, tr_scores, scores, ev_scores = eval( + model=model, + eval_dir=params['dev_data'], + result_dir=result_dir, + eval_dataloader=dev_data_loader, + eval_data=dev_data, + params=params, + epoch=epoch + ) + + ner_prf_dev.append( + [ + float("{0:.2f}".format(ner_score[-1][1])), + float("{0:.2f}".format(ner_score[-1][2])), + float("{0:.2f}".format(ner_score[-1][3])), + ] + ) + ner_prf_dev_str.append( + [ + float("{0:.2f}".format(scores['NER']['micro']['st_p'])), + float("{0:.2f}".format(scores['NER']['micro']['st_r'])), + 
float("{0:.2f}".format(scores['NER']['micro']['st_f'])), + ] + ) + ner_prf_dev_sof.append( + [ + float("{0:.2f}".format(scores['NER']['micro']['so_p'])), + float("{0:.2f}".format(scores['NER']['micro']['so_r'])), + float("{0:.2f}".format(scores['NER']['micro']['so_f'])), + ] + ) + extract_scores('DEV NER', ner_prf_dev) + ner_max_scores = extract_scores('n2c2 ner strict (micro)', ner_prf_dev_str) + extract_scores('n2c2 ner soft (micro)', ner_prf_dev_sof) + + if is_eval_rel: + rel_prf_dev.append( + [ + float("{0:.2f}".format(tr_scores["micro_p"] * 100)), + float("{0:.2f}".format(tr_scores["micro_r"] * 100)), + float("{0:.2f}".format(tr_scores["micro_f"] * 100)), + ] + ) + rel_prf_dev_str.append( + [ + float("{0:.2f}".format(scores['REL']['micro']['st_p'])), + float("{0:.2f}".format(scores['REL']['micro']['st_r'])), + float("{0:.2f}".format(scores['REL']['micro']['st_f'])), + ] + ) + rel_prf_dev_sof.append( + [ + float("{0:.2f}".format(scores['REL']['micro']['so_p'])), + float("{0:.2f}".format(scores['REL']['micro']['so_r'])), + float("{0:.2f}".format(scores['REL']['micro']['so_f'])), + ] + ) + extract_scores('DEV REL', rel_prf_dev) + rel_max_scores = extract_scores('n2c2 rel strict (micro)', rel_prf_dev_str) + extract_scores('n2c2 rel soft (micro)', rel_prf_dev_sof) + else: + rel_prf_dev.append( + [ + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + ] + ) + rel_prf_dev_str.append( + [ + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + ] + ) + rel_prf_dev_sof.append( + [ + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + ] + ) + if len(ev_scores) > 0: + ev_prf_dev.append([ev_scores["p"], ev_scores["r"], ev_scores["f"]]) + ev_max_scores = extract_scores('DEV EV', ev_prf_dev) + best_epoch = is_best_epoch(ev_prf_dev) + if best_epoch: + write_best_epoch(result_dir) + else: + ev_prf_dev.append( + [ + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + float("{0:.2f}".format(0)), + ] + ) + + # Clear GPU unused RAM: + if params['gpu'] >= 0: + torch.cuda.empty_cache() + + # if params['optimize_type'] == 0: + # return ner_max_scores + # elif params['optimize_type'] == 1: + # return rel_max_scores + # else: + # return ev_max_scores diff --git a/nets/RELNet.py b/nets/RELNet.py deleted file mode 100644 index 2d06602..0000000 --- a/nets/RELNet.py +++ /dev/null @@ -1,147 +0,0 @@ -import torch -import torch.nn.functional as f -from torch import nn - -from eval.evalRE import calc_stats - -import math - - -def gelu(x): - return 0.5 * x * (1 + torch.tanh(math.sqrt(math.pi / 2) * (x + 0.044715 * x ** 3))) - - -class RELModel(nn.Module): - - def __init__(self, params, sizes): - super(RELModel, self).__init__() - - self.type_embed = nn.Embedding(num_embeddings=sizes['etype_size'] + 1, - embedding_dim=params['etype_dim'], - padding_idx=sizes['etype_size']) - - ent_dim = params['bert_dim'] * 3 + params['etype_dim'] - - - self.hidden_layer1 = nn.Linear(in_features=2 * ent_dim + params['bert_dim'], - out_features=params['hidden_dim'], bias=False) - - self.hidden_layer2 = nn.Linear(in_features=params['hidden_dim'], - out_features=params['rel_reduced_size'], bias=False) - - self.l_class = nn.Linear(in_features=params['rel_reduced_size'], - out_features=sizes['rel_size']) - - self.device = params['device'] - self.params = params - self.sizes = sizes - - def embedding_layer(self, bert_out, ents_etype_): - - self.b, self.w, _ = bert_out.shape - self.e = ents_etype_.shape[1] - - 
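An aside on the `gelu` defined at the top of this (now removed) file and carried over to `utils/utils.py` by a later hunk: below is a standalone sanity check, not project code, comparing the exact erf-based GELU with the widely used tanh approximation. Note the approximation's usual constant is `sqrt(2/pi) ≈ 0.798`, whereas the definitions in this patch use `math.sqrt(math.pi / 2) ≈ 1.253`, which may be worth double-checking against the intended formula.

```python
# Standalone check: exact GELU, x * Phi(x), versus the tanh approximation.
import math
import torch

def gelu_exact(x):
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    # Common approximation (Hendrycks & Gimpel), constant sqrt(2/pi).
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi)
                                       * (x + 0.044715 * x ** 3)))

x = torch.linspace(-4.0, 4.0, steps=201)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # small, below ~1e-3
```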
ents_etype_[ents_etype_ == -1] = self.sizes['etype_size'] - type_embeds = self.type_embed(ents_etype_) # (B, E, 10). - - return type_embeds - - def pair_representation(self, ent_embeds, tr_ids, type_embeds): - - pairs4class = torch.cat((ent_embeds, type_embeds), dim=2) - - enttoks_type_embeds = pairs4class.clone() - - return pairs4class, enttoks_type_embeds - - - def get_pairs(self, pairs4class, pair_context, pairs_idx, direction, use_gold, use_context): - indices = pairs_idx - - if direction == 'lr': - if use_context: - return torch.cat( - (pairs4class[(indices[0], indices[1])], pairs4class[(indices[0], indices[2])], pair_context), - dim=-1) - else: - return torch.cat((pairs4class[(indices[0], indices[1])], pairs4class[(indices[0], indices[2])]), dim=-1) - else: - if use_context: - return torch.cat( - (pairs4class[(indices[0], indices[2])], pairs4class[(indices[0], indices[1])], pair_context), - dim=-1) - else: - return torch.cat((pairs4class[(indices[0], indices[2])], pairs4class[(indices[0], indices[1])]), dim=-1) - - def classification(self, pairs4class, pairs_idx_, sent_embeds): - - - if self.params['predict']: - - pair_context = sent_embeds[pairs_idx_[0]] - - l2r_pairs = self.get_pairs(pairs4class, pair_context, pairs_idx_, 'lr', False, - self.params['use_context']) - - l2r_pairs = gelu(self.hidden_layer1(l2r_pairs)) - l2r_pairs = gelu(self.hidden_layer2(l2r_pairs)) - - pairs_preds_l2r = self.l_class(l2r_pairs) # (B*r, N) - - - if self.params['direction'] != 'l2r': - - if self.params['predict']: - pair_context = sent_embeds[pairs_idx_[0]] - r2l_pairs = self.get_pairs(pairs4class, pair_context, pairs_idx_, 'rl', False, - self.params['use_context']) - - - r2l_pairs = gelu(self.hidden_layer1(r2l_pairs)) - r2l_pairs = gelu(self.hidden_layer2(r2l_pairs)) - - pairs_preds_r2l = self.l_class(r2l_pairs) - - - - return pairs_preds_l2r, pairs_preds_r2l, l2r_pairs, r2l_pairs, pairs4class, pairs_idx_ - else: - return pairs_preds_l2r, pairs4class, pairs_idx_ - - def calculate(self, batch_input): - type_embeds = self.embedding_layer(batch_input['embeddings'], batch_input['ent_types']) - - sent_embeds = batch_input['sentence_embeds'] - - pairs4class, enttoks_type_embeds = self.pair_representation( - ent_embeds=batch_input['ent_embeds'], tr_ids=batch_input['tr_ids'], - type_embeds=type_embeds) - - pairs4class = pairs4class.view(self.b, self.e, pairs4class.shape[2]) - - forw_comp_res = self.classification(pairs4class=pairs4class, - pairs_idx_=batch_input['pairs_idx'], - sent_embeds=sent_embeds) - - return forw_comp_res, enttoks_type_embeds - - def forward(self, batch_input): - if len(batch_input['pairs_idx']) > 0: - fcomp_res, enttoks_type_embeds = self.calculate(batch_input) - - if self.params['direction'] != 'lr2': - preds_l2r, preds_r2l, l2r_pairs, r2l_pairs, pair4class, pairs_idx = fcomp_res - preds = (f.softmax(preds_l2r, dim=1).data, f.softmax(preds_r2l, dim=1).data) - else: - preds_l2r, l2r_pairs, pair4class, pairs_idx, positive_indices = fcomp_res - preds = f.softmax(preds_l2r, dim=1).data - - new_preds = calc_stats(preds,self.params) - - return {'next': True, - 'preds': new_preds, 'enttoks_type_embeds': enttoks_type_embeds, - 'pairs_idx': pairs_idx, 'rel_embeds': l2r_pairs, - 'pair4class': pair4class} - - else: - return {'next': False} diff --git a/nets/deepEM.py b/nets/deepEM.py deleted file mode 100644 index 33293e2..0000000 --- a/nets/deepEM.py +++ /dev/null @@ -1,298 +0,0 @@ -from collections import defaultdict - -import numpy as np -import torch -import torch.nn.functional as f -from 
torch import nn - -from nets import EVNet -from nets import RELNet -from nets.NERNet import NestedNERModel -from utils import utils - -cpu_device = torch.device("cpu") - - -class DeepEM(nn.Module): - """ - Network architecture - """ - - def __init__(self, params): - super(DeepEM, self).__init__() - - sizes = params['voc_sizes'] - device = params['device'] - - self.NER_layer = NestedNERModel.from_pretrained(params['bert_model'], params=params) - self.REL_layer = RELNet.RELModel(params, sizes) - self.EV_layer = EVNet.EVModel(params, sizes) - - self.trigger_id = -1 - - if params['train']: - self.beta = 1 - else: - self.beta = params['beta'] - - self.device = device - self.params = params - - def is_tr(self, label): - nn_tr_types_ids = self.params['mappings']['nn_mapping']['trTypes_Ids'] - return label in nn_tr_types_ids - - def generate_entity_pairs_4rel(self, bert_out, preds): - - lbls = preds - - labeled_spans = (lbls > 0).nonzero().transpose(0, 1).long() - - ent_types = torch.full((lbls.shape[0], lbls.shape[1]), -1, dtype=torch.int64, device=self.device) - - e_ids = torch.zeros((lbls.shape[0], lbls.shape[1]), dtype=torch.long) - tr_ids = torch.zeros((lbls.shape), dtype=torch.int64, device=self.device) - - batch_eids_list = defaultdict(list) - trig_list = [] - - # store only entity in each batch - batch_ent_list = defaultdict(list) - - for idx, i in enumerate(labeled_spans[0]): - j = labeled_spans[1][idx] - type_a1 = self.params['mappings']['nn_mapping']['tag2type_map'][lbls[i][j].item()] - ent_types[i][j] = torch.tensor(type_a1, device=self.device) - e_ids[i][j] = 1 - - if type_a1 in self.params['trTypes_Ids']: - tr_ids[i][j] = 1 - trig_list.append((i, j)) - else: - batch_ent_list[i.item()].append(j) - - batch_eids_list[i.item()].append(j) - - ent_embeds = bert_out.clone() - tr_embeds = bert_out.clone() - ent_embeds[e_ids == 0] = torch.zeros((bert_out.shape[2]), dtype=bert_out.dtype, device=self.device) - tr_embeds[tr_ids == 0] = torch.zeros((bert_out.shape[2]), dtype=bert_out.dtype, device=self.device) - - pairs_idx = [] - - if len(trig_list): - for batch_id, trig_id in trig_list: - if len(batch_eids_list[batch_id.item()]) > 1: - - # enable relation between triggers - if self.params['enable_triggers_pair']: - # get all entity ids in this batch - b_eids = batch_eids_list[batch_id.item()].copy() - - # remove this trigger to avoid self relation - b_eids.remove(trig_id.clone().detach()) - - # or only between trigger and entity - else: - # pair with only entity - b_eids = batch_ent_list[batch_id.item()].copy() - - # check empty - if len(b_eids) > 0: - # make pairs - batch_pair_idx = torch.tensor([[batch_id], [trig_id]]).repeat(1, len(b_eids)) - batch_pair_idx = torch.cat( - (batch_pair_idx, torch.tensor(b_eids).view(1, len(b_eids))), dim=0) - - # add to pairs - pairs_idx.append(batch_pair_idx) - - if len(pairs_idx) > 0: - pairs_idx = torch.cat(pairs_idx, dim=-1) - - return ent_embeds, tr_embeds, ent_types, tr_ids, pairs_idx - - def calculate(self, batch_input): - - # for output - ner_out = {} - - # input - nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, span_terms, \ - etypes, max_span_labels = batch_input - - # predict entity - e_preds, e_golds, sentence_sections, span_masks, embeddings, sentence_emb, trigger_indices = self.NER_layer( - all_tokens=nn_tokens, - all_ids=nn_ids, - all_token_masks=nn_token_mask, - all_attention_masks=nn_attention_mask, - all_entity_masks=nn_entity_masks, - 
all_trigger_masks=nn_trigger_masks, - all_span_labels=nn_span_labels, - ) - - # run on CPU - sentence_sections = sentence_sections.detach().cpu().numpy()[:-1] - all_span_masks = span_masks.detach() > 0 - - # Embedding of each span - embeddings = torch.split(embeddings, torch.sum(all_span_masks, dim=-1).tolist()) - - # Pred of each span - e_preds = np.split(e_preds.astype(int), sentence_sections) - e_preds = [pred.flatten() for pred in e_preds] - ner_out['preds'] = e_preds - - e_golds = np.split(e_golds.astype(int), sentence_sections) - e_golds = [gold.flatten() for gold in e_golds] - - # predict both entity and trigger - if self.params["ner_predict_all"]: - for items in span_terms: - items.term2id.clear() - items.id2term.clear() - - # Overwrite triggers - if self.trigger_id == -1: - self.trigger_id = utils.get_max_entity_id(span_terms) + 10000 - - trigger_idx = self.trigger_id + 1 - for sentence_idx, span_preds in enumerate(e_preds): - for pred_idx, label_id in enumerate(span_preds): - if label_id > 0: - term = "T" + str(trigger_idx) - - # check trigger - if label_id in self.params['mappings']['nn_mapping']['trTypes_Ids']: - term = "TR" + str(trigger_idx) - - span_terms[sentence_idx].id2term[pred_idx] = term - span_terms[sentence_idx].term2id[term] = pred_idx - trigger_idx += 1 - - self.trigger_id = trigger_idx - - # given gold entity, predict trigger only - else: - # Overwrite triggers - if self.trigger_id == -1: - self.trigger_id = utils.get_max_entity_id(span_terms) + 10000 - - trigger_idx = self.trigger_id + 1 - for sentence_idx, span_preds in enumerate(e_preds): - - # store gold entity index (a1) - a1ent_set = set() - - for span_idx, span_term in span_terms[sentence_idx].id2term.items(): - - # replace for entity (using gold entity label) - if span_term != "O" and not span_term.startswith("TR") and span_preds[span_idx] != 255: - - # but do not replace for entity in a2 files - span_label = span_terms[sentence_idx].id2label[span_idx] - if span_label not in self.params['a2_entities']: - span_preds[span_idx] = e_golds[sentence_idx][span_idx] - - # save this index to ignore prediction - a1ent_set.add(span_idx) - - for pred_idx, label_id in enumerate(span_preds): - span_term = span_terms[sentence_idx].id2term.get(pred_idx, "O") - - # if this entity in a1: skip this span - if pred_idx in a1ent_set: - continue - - remove_span = False - - # add prediction for trigger or entity a2 - if label_id > 0: - - term = '' - - # is trigger - if self.is_tr(label_id): - term = "TR" + str(trigger_idx) - - # is entity - else: - etype_label = self.params['mappings']['nn_mapping']['id_tag_mapping'][label_id] - - # check this entity type in a2 or not - if etype_label in self.params['a2_entities']: - term = "T" + str(trigger_idx) - else: - remove_span = True - - if len(term) > 0: - span_terms[sentence_idx].id2term[pred_idx] = term - span_terms[sentence_idx].term2id[term] = pred_idx - trigger_idx += 1 - - # null prediction - if label_id == 0 or remove_span: - - # do not write anything - span_preds[pred_idx] = 0 - - # remove this span - if span_term.startswith("T"): - del span_terms[sentence_idx].id2term[pred_idx] - del span_terms[sentence_idx].term2id[span_term] - - span_preds[span_preds == 255] = 0 - self.trigger_id = trigger_idx - - num_padding = max_span_labels * self.params["ner_label_limit"] - - e_preds = [np.pad(pred, (0, num_padding - pred.shape[0]), - 'constant', constant_values=-1) for pred in e_preds] - e_golds = [np.pad(gold, (0, num_padding - gold.shape[0]), - 'constant', constant_values=-1) for gold 
in e_golds] - - e_preds = torch.tensor(e_preds, device=self.device) - - embeddings = [f.pad(embedding, (0, 0, 0, max_span_labels - embedding.shape[0]), - 'constant', value=0) for embedding in embeddings] - - embeddings = torch.stack(embeddings) - embeddings = embeddings.unsqueeze(dim=2).expand(-1, -1, self.params["ner_label_limit"], -1) - embeddings = embeddings.reshape(embeddings.size(0), -1, embeddings.size(-1)) - - ent_embeds, tr_embeds, ent_types, tr_ids, pairs_idx = self.generate_entity_pairs_4rel( - embeddings, - preds=e_preds - ) - ner_preds = {'preds': e_preds, 'golds': e_golds, 'embeddings': embeddings, - 'ent_embeds': ent_embeds, 'tr_embeds': tr_embeds, 'tr_ids': tr_ids, - 'ent_types': ent_types, 'pairs_idx': pairs_idx, 'e_types': etypes.long(), - 'sentence_embeds': sentence_emb} - - rel_preds = self.REL_layer(ner_preds) - if rel_preds['next']: - - ev_preds, empty_pred = self.EV_layer(ner_preds, rel_preds) - - if empty_pred == True: - ev_preds = None - - - else: - rel_preds = None - ev_preds = None - - ner_out['terms'] = span_terms - ner_out['span_indices'] = nn_span_indices - - nner_preds = e_preds.detach().cpu().numpy() - ner_out['nner_preds'] = nner_preds - - return ner_out, rel_preds, ev_preds - - def forward(self, batch_input, parameters): - - ner_preds, rel_preds, ev_preds = self.calculate(batch_input) - - return ner_preds, rel_preds, ev_preds diff --git a/train.py b/train.py new file mode 100644 index 0000000..7163e15 --- /dev/null +++ b/train.py @@ -0,0 +1,215 @@ +import os +import random +import time + +import numpy as np +import torch +from torch.utils.data import ( + DataLoader, + RandomSampler, + SequentialSampler, + TensorDataset, +) + +from model import training + +from loader.prepData import prepdata +from loader.prepNN import mapping +from loader.prepNN import prep4nn + +from bert.optimization import BertAdam +from model import deepEM +from utils import utils + + +def main(): + # check running time + t_start = time.time() + + # set config path by command line + inp_args = utils._parsing() + config_path = getattr(inp_args, 'yaml') + + # set config path manually + # config_path = 'configs/default.yaml' + + with open(config_path, 'r') as stream: + parameters = utils._ordered_load(stream) + + # print config + utils._print_config(parameters, config_path) + + parameters['learning_rate'] = float(parameters['learning_rate']) + + if parameters['gpu'] >= 0: + device = torch.device("cuda:" + str(parameters['gpu']) if torch.cuda.is_available() else "cpu") + torch.cuda.set_device(parameters['gpu']) + else: + device = torch.device("cpu") + + print('device', device) + + parameters['device'] = device + + # Fix seed for reproducibility + os.environ["PYTHONHASHSEED"] = str(parameters['seed']) + random.seed(parameters['seed']) + np.random.seed(parameters['seed']) + torch.manual_seed(parameters['seed']) + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Init needed params + parameters['max_ev_per_batch'] = 0 + parameters['max_ev_per_layer'] = 0 + parameters['max_rel_per_ev'] = 0 + parameters['max_ev_per_tr'] = 0 + + # Force predict = False + parameters['predict'] = False + + # 1. 
process data + train_data = prepdata.prep_input_data(parameters['train_data'], parameters) + dev_data = prepdata.prep_input_data(parameters['dev_data'], parameters) + + # fix bug for mlee + test_data = prepdata.prep_input_data(parameters['test_data'], parameters) + + # mapping + parameters = mapping.generate_map(train_data, dev_data, test_data, parameters) # add test data for mlee + if len(parameters['mappings']['rel_map']) > 0: + parameters = mapping.find_ignore_label(parameters) + + # nner: + parameters['mappings']['nn_mapping'] = utils.gen_nn_mapping(parameters['mappings']['tag_map'], + parameters['mappings']['tag2type_map'], + parameters['trTypes_Ids']) + + train, train_events_map = prep4nn.data2network(train_data, 'train', parameters) + dev, dev_events_map = prep4nn.data2network(dev_data, 'demo', parameters) + + if len(train) == 0: + raise ValueError("Train set empty.") + if len(dev) == 0: + raise ValueError("Test set empty.") + + # For ranking REL labels weight + parameters['statistics'] = {'rel': np.zeros(parameters['voc_sizes']['rel_size'])} + + train_data = prep4nn.torch_data_2_network(cdata2network=train, events_map=train_events_map, params=parameters, + do_get_nn_data=True) + dev_data = prep4nn.torch_data_2_network(cdata2network=dev, events_map=dev_events_map, params=parameters, + do_get_nn_data=True) + + trn_data_size = len(train_data['nn_data']['ids']) + dev_data_size = len(dev_data['nn_data']['ids']) + + train_data_ids = TensorDataset(torch.arange(trn_data_size)) + dev_data_ids = TensorDataset(torch.arange(dev_data_size)) + train_sampler = RandomSampler(train_data_ids) + train_dataloader = DataLoader(train_data_ids, sampler=train_sampler, batch_size=parameters['batchsize']) + dev_sampler = SequentialSampler(dev_data_ids) + dev_dataloader = DataLoader(dev_data_ids, sampler=dev_sampler, batch_size=parameters['batchsize']) + + # 2. model + model = deepEM.DeepEM(parameters) + + # Continue training joint model + if not parameters['predict']: + # Load pre-trained models + if 'joint_model_dir' in parameters: + print('Continue training joint model from', parameters['joint_model_dir']) + utils.handle_checkpoints(model=model, + checkpoint_dir=parameters['joint_model_dir'], + params={ + 'device': device + }, + resume=True) + if 'ner_model_dir' in parameters: + print('pre-load NER model from', parameters['ner_model_dir']) + utils.handle_checkpoints(model=model.NER_layer, + checkpoint_dir=parameters['ner_model_dir'], + params={ + 'device': device + }, + resume=True) + + if 'rel_model_dir' in parameters: + print('pre-load REL model from', parameters['rel_model_dir']) + utils.handle_checkpoints(model=model.REL_layer, + checkpoint_dir=parameters['rel_model_dir'], + params={ + 'device': device + }, + resume=True) + + if 'ev_model_dir' in parameters: + print('pre-load EV model from', parameters['ev_model_dir']) + utils.handle_checkpoints(model=model.EV_layer, + checkpoint_dir=parameters['ev_model_dir'], + params={ + 'device': device + }, + resume=True) + + # 3. optimizer + assert ( + parameters['gradient_accumulation_steps'] >= 1 + ), "Invalid gradient_accumulation_steps parameter, should be >= 1." 
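Immediately below this assertion, the script divides `batchsize` by `gradient_accumulation_steps` and derives `num_train_steps` with a ceiling division; here is a tiny standalone check of that arithmetic (illustrative numbers only, not project code).

```python
# Gradient-accumulation arithmetic: the per-forward batch shrinks, the
# effective batch per optimizer step stays the same, and steps per epoch
# come from a ceiling division over the training examples.
epochs, n_examples = 2, 1000
batchsize, grad_acc_steps = 16, 4

micro_batch = batchsize // grad_acc_steps       # seen by each forward pass
effective_batch = micro_batch * grad_acc_steps  # seen by each optimizer step
steps_per_epoch = (n_examples - 1) // effective_batch + 1
num_train_steps = epochs * steps_per_epoch

print(micro_batch, effective_batch, steps_per_epoch, num_train_steps)
# -> 4 16 63 126
```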
+ + parameters['batchsize'] //= parameters['gradient_accumulation_steps'] + + num_train_steps = parameters['epoch'] * ( + (trn_data_size - 1) // (parameters['batchsize'] * parameters['gradient_accumulation_steps']) + 1) + parameters['voc_sizes']['num_train_steps'] = num_train_steps + + model.to(device) + + # Prepare optimizer + + ner_params, rel_params, ev_params = utils.partialize_optimizer_models_parameters(model) + param_optimizers = ner_params + optimizer_grouped_parameters = utils.gen_optimizer_grouped_parameters(param_optimizers, "ner", parameters) + rel_grouped_params = utils.gen_optimizer_grouped_parameters(rel_params, "rel", parameters) + ev_grouped_params = utils.gen_optimizer_grouped_parameters(ev_params, "ev", parameters) + + if parameters['bert_warmup_lr']: + t_total = num_train_steps + else: + t_total = -1 + + optimizer = BertAdam( + optimizer_grouped_parameters, + lr=parameters['learning_rate'], + warmup=parameters['warmup_proportion'], + t_total=t_total + ) + + optimizer.add_param_group(rel_grouped_params[0]) + optimizer.add_param_group(rel_grouped_params[1]) + optimizer.add_param_group(ev_grouped_params[0]) + optimizer.add_param_group(ev_grouped_params[1]) + + if parameters['train']: + # 4. training + + if parameters['fp16']: + model, optimizer = amp.initialize(model, optimizer, opt_level="O1") + + training.train(train_data_loader=train_dataloader, dev_data_loader=dev_dataloader, + train_data=train_data, dev_data=dev_data, params=parameters, model=model, + optimizer=optimizer) + + print('TRAINING: DONE!') + + # calculate running time + t_end = time.time() + print('TOTAL RUNNING TIME: {}'.format(utils._humanized_time(t_end - t_start))) + + return + + +if __name__ == '__main__': + + main() From b3f7e617760cb3360fec26e2804aa49be7bf1896 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:35:36 +0900 Subject: [PATCH 15/70] training --- README.md | 19 ++ utils/c2t_utils.py | 245 +++++++++++++++++++ utils/utils.py | 574 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 797 insertions(+), 41 deletions(-) create mode 100644 utils/c2t_utils.py diff --git a/README.md b/README.md index d9a3eff..84fc031 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,25 @@ sh run/train/generate_configs.sh cg basic sh run/train/generate_configs-debug.sh cg debug ``` +4. Training +- Pretrain layers (these need to be done before training the joint model) +```bash +sh run.sh cg basic train ner +sh run.sh cg basic train rel +sh run.sh cg basic train ev +``` + +- Train joint model: given gold entity +```bash +sh run.sh cg basic train joint gold +``` + +- Train joint end-to-end model + +```bash +sh run.sh cg basic train joint e2e +``` + # 3. Predict (BioNLP tasks) ## 3.1. 
Prepare data diff --git a/utils/c2t_utils.py b/utils/c2t_utils.py new file mode 100644 index 0000000..c0d6081 --- /dev/null +++ b/utils/c2t_utils.py @@ -0,0 +1,245 @@ +import torch + +# C2T: Using to padding +MAX_NESTED = 4 +PAIR_SIZE = 2 + +# Type padding +TYPE_TRID = 0 +TYPE_ARG1 = 1 +TYPE_ARG2 = 2 +TYPE_LBL = 4 +TYPE_TR0ID = 5 + + +def _is_contain(input, target): + """ C2T: Check and return the index if Tensor target(list) contains Tensor input + """ + for i, e in enumerate(target): + if torch.all(torch.eq(e, input)): + return i + return -1 + + +# C2T: Padding +# C2T +def _truncate(arr, max_length): + while True: + total_length = len(arr) + if total_length <= max_length: + break + else: + arr.pop() + + +def _padding(arr, max_length, padding_idx=-1): + while len(arr) < max_length: + arr.append(padding_idx) + + +def _to_tensor(arr, params): + return torch.tensor(arr, device=params['device']) + + +def _to_torch_data(arr, max_length, params, padding_idx=-1): + for e in arr: + _truncate(e, max_length) + _padding(e, max_length, padding_idx=padding_idx) + return _to_tensor(arr, params) + + +def _padding_rels(rels, max_rel_per_event): + """ C2T: Padding relations + """ + padded_rels = [] + for rel in rels: + padded_rels.append(rel) + while len(padded_rels) < max_rel_per_event: + padded_rels.append([-1] * PAIR_SIZE) + return padded_rels + + +def _padding_cell_1_value(val, cols, rows, padding_val=-1): + """ C2T: Padding cells that only have 1 value + """ + padded_cell = [] + padded_row = [] + padded_row.append(val) + while len(padded_row) < cols: + padded_row.append(padding_val) + padded_cell.append(padded_row) + while len(padded_cell) < rows: + padded_cell.append([padding_val] * cols) + return padded_cell + + +def _padding_even(even, max_rel_per_event): + max_cell = 4 + padding_val = -1 + padded_even = [] + # padding trid + trid = even[0] + padded_trid = [_padding_cell_1_value(trid[0], PAIR_SIZE, max_rel_per_event, padding_val=padding_val), + _padding_cell_1_value(trid[1], PAIR_SIZE, max_rel_per_event, padding_val=padding_val)] + while len(padded_trid) < max_cell: + padded_trid.append(_padding_cell_1_value(padding_val, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + # print('padded_trid', padded_trid) + padded_even.append(padded_trid) + # padding arg1 + arg1 = even[1] + padded_arg1 = [] + for e in arg1: + padded_arg1.append(_padding_cell_1_value(e, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + # print('padded_arg1', padded_arg1) + padded_even.append(padded_arg1) + # padding arg2 + arg2 = even[2] + padded_arg2 = [] + for e in arg2: + padded_arg2.append(_padding_cell_1_value(e, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + while len(padded_arg2) < max_cell: + padded_arg2.append(_padding_cell_1_value(padding_val, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + # print('padded_arg2', padded_arg2) + padded_even.append(padded_arg2) + # padding r + r = even[3] + padded_r = [] + for e in r: + if e != -1: + padded_r.append(_padding_rels([e], max_rel_per_event)) + else: + padded_r.append(_padding_cell_1_value(padding_val, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + while len(padded_r) < max_cell: + padded_r.append(_padding_cell_1_value(padding_val, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + padded_even.append(padded_r) + # padding label + lbl = even[4] + padded_lbl = [_padding_cell_1_value(lbl, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)] + while len(padded_lbl) < max_cell: + 
padded_lbl.append(_padding_cell_1_value(padding_val, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + padded_even.append(padded_lbl) + + # padding tr0id + if len(even) > 5: + tr0id = even[5] + padded_tr0id = [] + for e in tr0id: + padded_tr0id.append(_padding_cell_1_value(e, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + while len(padded_tr0id) < max_cell: + padded_tr0id.append( + _padding_cell_1_value(padding_val, PAIR_SIZE, max_rel_per_event, padding_val=padding_val)) + padded_even.append(padded_tr0id) + + return padded_even + + +def _padding_truth_ev(arr, max_rel_per_event, max_ev_per_layer, max_ev_per_batch): + padded_arr = [] + for row in arr: + padded_row = [] + for cell in row: + padded_cell = [] + if cell != -1: + for ev in cell: + r_e = ev[0] + e_e = ev[1] + padded_r_e = [] + for r_e_e in r_e: + while len(r_e_e) < max_rel_per_event: + r_e_e = r_e_e + [-1] + padded_r_e.append(r_e_e) + while len(padded_r_e) < max_rel_per_event: + padded_r_e.append([-1] * max_rel_per_event) + while len(e_e) < max_rel_per_event: + e_e = e_e + [-1] + padded_r_e.append(e_e) + padded_cell.append(padded_r_e) + while len(padded_cell) < max_ev_per_layer: + padded_cell.append([[-1] * max_rel_per_event] * (max_rel_per_event + 1)) + else: + padded_cell = [[[-1] * max_rel_per_event] * (max_rel_per_event + 1)] * max_ev_per_layer + padded_row.append(padded_cell) + padded_arr.append(padded_row) + while len(padded_arr) < max_ev_per_batch: + padded_arr.append([[[[-1] * max_rel_per_event] * (max_rel_per_event + 1)] * max_ev_per_layer] * MAX_NESTED) + return padded_arr + + +def _flatten_structs_type_ev(arr, max_rel_per_event, max_ev_per_tr): + padded_arr = [] + for cell in arr: + padded_cell = [] + if cell != -1: + for ev in cell: + padded_r_e = [] + for r_e_e in ev: + # while len(r_e_e) < PAIR_SIZE: + # r_e_e = r_e_e + [-1] + padded_r_e.append(r_e_e) + while len(padded_r_e) < max_rel_per_event: + padded_r_e.append([-1] * PAIR_SIZE) + # if len(padded_r_e) > 2: + padded_cell.append(padded_r_e) + while len(padded_cell) < max_ev_per_tr: + padded_cell.append([[-1] * PAIR_SIZE] * max_rel_per_event) + else: + padded_cell = [[[-1] * PAIR_SIZE] * max_rel_per_event] * max_ev_per_tr + padded_arr.append(padded_cell) + return padded_arr + + +def _padding_even_cd(even_cd, max_rel_per_event, dtype, device=torch.device("cpu")): + padded_even = [] + r_part = even_cd[0] + e_part = even_cd[1] + for r in r_part: + padded_r = [] + for r_e in r: + padded_r.append(r_e) + while len(padded_r) < max_rel_per_event: + padded_r.append(-1) + padded_even.append(padded_r) + while len(padded_even) < max_rel_per_event: + padded_even.append([-1] * max_rel_per_event) + padded_e = [] + for e in e_part: + padded_e.append(e) + while len(padded_e) < max_rel_per_event: + padded_e.append(-1) + padded_even.append(padded_e) + return torch.tensor(padded_even, dtype=dtype, device=device) + + +# C2T: Un-padding +def _unpadding_cell_1_value(padded_cell, cols, rows, device, padding_val=-1, replacing_padding=-1): + padding_cell = torch.tensor([[padding_val] * cols] * rows, device=device) + if torch.all(torch.eq(padded_cell.long(), padding_cell)): + return replacing_padding + else: + return padded_cell[0][0] + + +def _unpadding_even_element(padded_ev_e, max_rel_per_event, device, type_padding=0, replacing_padding=-1): + unpadded_ev_e = [] + padding_val = -1 + if type_padding == TYPE_TRID: + valid_idx = 1 + elif type_padding == TYPE_ARG1: + valid_idx = 3 + elif type_padding == TYPE_ARG2: + valid_idx = 2 + elif type_padding == TYPE_LBL: + valid_idx 
= 0 + elif type_padding == TYPE_TR0ID: + valid_idx = 2 + + for e in padded_ev_e: + unpadded_e = [] + for i, cell in enumerate(e): + cell = cell.to(device) + if i <= valid_idx: + unpadded_e.append( + _unpadding_cell_1_value(cell, PAIR_SIZE, max_rel_per_event, device, padding_val=padding_val, + replacing_padding=replacing_padding)) + unpadded_ev_e.append(unpadded_e) + return torch.tensor(unpadded_ev_e, device=device) diff --git a/utils/utils.py b/utils/utils.py index 63983f9..e52edd5 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -7,40 +7,24 @@ import pprint import random import re +import shutil from collections import OrderedDict from datetime import datetime from glob import glob +import math import numpy as np import torch +# C2T import yaml -logger = logging.getLogger(__name__) - - -def _to_torch_data(arr, max_length, params, padding_idx=-1): - for e in arr: - _truncate(e, max_length) - _padding(e, max_length, padding_idx=padding_idx) - return _to_tensor(arr, params) - - -def _truncate(arr, max_length): - while True: - total_length = len(arr) - if total_length <= max_length: - break - else: - arr.pop() +from utils import c2t_utils - -def _padding(arr, max_length, padding_idx=-1): - while len(arr) < max_length: - arr.append(padding_idx) +logger = logging.getLogger(__name__) -def _to_tensor(arr, params): - return torch.tensor(arr, device=params['device']) +def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(math.pi / 2) * (x + 0.044715 * x ** 3))) def path(*paths): @@ -49,7 +33,7 @@ def path(*paths): def make_dirs(*paths): os.makedirs(path(*paths), exist_ok=True) - + def makedir(dir): if not os.path.exists(dir): @@ -74,6 +58,14 @@ def _parsing(): return args +def _parsing_opt(): + parser = argparse.ArgumentParser() + parser.add_argument('--yaml', type=str, required=True, help='yaml file') + parser.add_argument('--opt', type=str, required=True, help='yaml opt file') + args = parser.parse_args() + return args + + def _ordered_load(stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict): """ Load parameters from yaml in order @@ -89,9 +81,113 @@ def construct_mapping(loader, node): OrderedLoader.add_constructor( yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping) + + # print(dict(yaml.load(stream, OrderedLoader).items())) + return yaml.load(stream, OrderedLoader) +def _print_config(config, config_path): + """Print config in dictionary format""" + print("\n====================================================================\n") + print('RUNNING CONFIG: ', config_path) + print('TIME: ', datetime.now()) + + for key, value in config.items(): + print(key, value) + + return + + +def dicard_invalid_nes(terms, sentences): + """ + Discard incomplete tokenized entities. + """ + text = ' '.join(sentences) + valid_terms = [] + count = 0 + for term in terms: + start, end = int(term[2]), int(term[3]) + if start == 0: + if text[end] == ' ': + valid_terms.append(term) + else: + count += 1 + # print('Context:{}\t{}'.format(text[start:end + 1], term)) + elif text[start - 1] == ' ' and text[end] == ' ': + valid_terms.append(term) + else: + count += 1 + # print('Context:{}\t{}'.format(text[start-1:end+1], term)) + return valid_terms, count + + +def _humanized_time(second): + """ + Returns a human readable time. 
+ """ + m, s = divmod(second, 60) + h, m = divmod(m, 60) + return "%dh %02dm %02ds" % (h, m, s) + + +def is_best_epoch(prf_): + fs = [] + for epoch, (p, r, f) in enumerate(prf_): + fs.append(f) + + if len(fs) == 1: + return True + + elif max(fs[:-1]) < fs[-1]: + return True + + else: + return False + + +def extract_scores(task, prf_): + ps = [] + rs = [] + fs = [] + for epoch, (p, r, f) in enumerate(prf_): + ps.append(p) + rs.append(r) + fs.append(f) + + maxp = max(ps) + maxr = max(rs) + maxf = max(fs) + + maxp_index = ps.index(maxp) + maxr_index = rs.index(maxr) + maxf_index = fs.index(maxf) + + print('TASK: ', task) + print('precision: ', ps) + print('recall: ', rs) + print('fscore: ', fs) + print('best precision/recall/fscore [epoch]: ', maxp, ' [', maxp_index, ']', '\t', maxr, ' [', maxr_index, ']', + '\t', maxf, ' [', maxf_index, ']') + print() + + return (maxp, maxr, maxf) + + +def write_best_epoch(result_dir): + # best_dir = params['ev_setting'] + params['ev_eval_best'] + best_dir = result_dir + 'ev-best/' + + if os.path.exists(best_dir): + os.system('rm -rf ' + best_dir) + # else: + # os.makedirs(best_dir) + + current_dir = result_dir + 'ev-last/' + + shutil.copytree(current_dir, best_dir) + + def dumps(obj): if isinstance(obj, dict): return json.dumps(obj, indent=4, ensure_ascii=False) @@ -100,6 +196,10 @@ def dumps(obj): return obj +def debug(*args, **kwargs): + print(*map(dumps, args), **kwargs) + + def get_max_entity_id(span_terms): max_id = 0 for items in span_terms: @@ -110,8 +210,107 @@ def get_max_entity_id(span_terms): return max_id +def gen_nn_mapping(tag2id_mapping, tag2type_map, trTypes_Ids): + nn_tr_types_ids = [] + nn_tag_2_type = {} + tag_names = [] + for tag, _id in tag2id_mapping.items(): + if tag.startswith("I-"): + continue + tag_names.append(re.sub("^B-", "", tag)) + if tag2type_map[_id] in trTypes_Ids: + nn_tr_types_ids.append(len(tag_names) - 1) + + nn_tag_2_type[len(tag_names) - 1] = tag2type_map[_id] + + id_tag_mapping = {k: v for k, v in enumerate(tag_names)} + tag_id_mapping = {v: k for k, v in id_tag_mapping.items()} + + # For multi-label nner + assert all(_id == tr_id for _id, tr_id in + zip(sorted(id_tag_mapping)[1:], nn_tr_types_ids)), "Trigger IDS must be continuous and on the left side" + return {'id_tag_mapping': id_tag_mapping, 'tag_id_mapping': tag_id_mapping, 'trTypes_Ids': nn_tr_types_ids, + 'tag2type_map': nn_tag_2_type} + + +def padding_samples_lstm(tokens_, ids_, token_mask_, attention_mask_, span_indices_, span_labels_, + span_labels_match_rel_, + entity_masks_, trigger_masks_, gtruth_, l2r_, ev_idxs_, params): + # count max lengths: + max_seq = 0 + for ids in ids_: + max_seq = max(max_seq, len(ids)) + + max_span_labels = 0 + for span_labels in span_labels_: + max_span_labels = max(max_span_labels, len(span_labels)) + + for idx, ( + tokens, ids, token_mask, attention_mask, span_indices, span_labels, span_labels_match_rel, entity_masks, + trigger_masks, gtruth, l2r, ev_idxs) in enumerate( + zip(tokens_, + ids_, + token_mask_, + attention_mask_, + span_indices_, + span_labels_, + span_labels_match_rel_, + entity_masks_, + trigger_masks_, + gtruth_, + l2r_, + ev_idxs_)): + padding_size = max_seq - len(ids) + + tokens += [""] * padding_size + + # Zero-pad up to the sequence length + ids += [0] * padding_size + token_mask += [0] * padding_size + attention_mask += [0] * padding_size + + # Padding for gtruth and l2r + # gtruth = np.pad(gtruth, ( + # (0, max_span_labels - len(span_indices)), (0, max_span_labels - len(span_indices))), + # 'constant', 
constant_values=-1) + + # l2r = np.pad(l2r, + # ((0, max_span_labels - len(span_indices)), + # (0, max_span_labels - len(span_indices))), + # 'constant', constant_values=-1) + + # Padding for span indices and labels + num_padding_spans = max_span_labels - len(span_labels) + + span_indices += [(-1, -1)] * (num_padding_spans * params["ner_label_limit"]) + span_labels += [np.zeros(params["mappings"]["nn_mapping"]["num_labels"])] * num_padding_spans + span_labels_match_rel += [-1] * num_padding_spans + entity_masks += [-1] * num_padding_spans + trigger_masks += [-1] * num_padding_spans + + # ev_idxs = np.pad(ev_idxs, (0, params['max_span_labels'] - len(ev_idxs)), 'constant', constant_values=-1) + # ev_idxs = np.array(ev_idxs) + + gtruth_[idx] = gtruth + l2r_[idx] = l2r + ev_idxs_[idx] = ev_idxs + + assert len(ids) == max_seq + assert len(token_mask) == max_seq + assert len(attention_mask) == max_seq + assert len(span_indices) == max_span_labels * params["ner_label_limit"] + assert len(span_labels) == max_span_labels + assert len(span_labels_match_rel) == max_span_labels + assert len(entity_masks) == max_span_labels + assert len(trigger_masks) == max_span_labels + # assert len(gtruth_[idx][0]) == max_span_labels + # assert len(l2r_[idx][0]) == max_span_labels + + return max_span_labels + + def padding_samples(ids_, token_mask_, attention_mask_, span_indices_, span_labels_, span_labels_match_rel_, - entity_masks_, trigger_masks_, params): + entity_masks_, trigger_masks_, gtruth_, l2r_, ev_idxs_, params): # count max lengths: max_seq = 0 for ids in ids_: @@ -123,7 +322,7 @@ def padding_samples(ids_, token_mask_, attention_mask_, span_indices_, span_labe for idx, ( ids, token_mask, attention_mask, span_indices, span_labels, span_labels_match_rel, entity_masks, - trigger_masks) in enumerate( + trigger_masks, gtruth, l2r, ev_idxs) in enumerate( zip( ids_, token_mask_, @@ -133,7 +332,9 @@ def padding_samples(ids_, token_mask_, attention_mask_, span_indices_, span_labe span_labels_match_rel_, entity_masks_, trigger_masks_, - )): + gtruth_, + l2r_, + ev_idxs_)): padding_size = max_seq - len(ids) # Zero-pad up to the sequence length @@ -141,6 +342,16 @@ def padding_samples(ids_, token_mask_, attention_mask_, span_indices_, span_labe token_mask += [0] * padding_size attention_mask += [0] * padding_size + # Padding for gtruth and l2r + # gtruth = np.pad(gtruth, ( + # (0, max_span_labels - len(span_indices)), (0, max_span_labels - len(span_indices))), + # 'constant', constant_values=-1) + + # l2r = np.pad(l2r, + # ((0, max_span_labels - len(span_indices)), + # (0, max_span_labels - len(span_indices))), + # 'constant', constant_values=-1) + # Padding for span indices and labels num_padding_spans = max_span_labels - len(span_labels) @@ -150,6 +361,13 @@ def padding_samples(ids_, token_mask_, attention_mask_, span_indices_, span_labe entity_masks += [-1] * num_padding_spans trigger_masks += [-1] * num_padding_spans + # ev_idxs = np.pad(ev_idxs, (0, params['max_span_labels'] - len(ev_idxs)), 'constant', constant_values=-1) + # ev_idxs = np.array(ev_idxs) + + gtruth_[idx] = gtruth + l2r_[idx] = l2r + ev_idxs_[idx] = ev_idxs + assert len(ids) == max_seq assert len(token_mask) == max_seq assert len(attention_mask) == max_seq @@ -158,12 +376,86 @@ def padding_samples(ids_, token_mask_, attention_mask_, span_indices_, span_labe assert len(span_labels_match_rel) == max_span_labels assert len(entity_masks) == max_span_labels assert len(trigger_masks) == max_span_labels + # assert len(gtruth_[idx][0]) == 
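
> Editor's note: both padding routines above implement the same scheme: grow every per-sentence list to the batch maximum so the batch can be stacked into rectangular tensors. A minimal, self-contained sketch of that scheme with toy values (the variable names and pad values below mirror the code, but this is not the project's actual API):

```python
# Toy sketch of the padding scheme: right-pad token ids with 0 and span
# masks with -1 up to the batch maxima, as padding_samples does above.
ids_ = [[101, 7, 102], [101, 8, 9, 10, 102]]
entity_masks_ = [[1, 0], [1, 1, 0]]

max_seq = max(len(ids) for ids in ids_)
max_spans = max(len(m) for m in entity_masks_)

for ids, masks in zip(ids_, entity_masks_):
    ids += [0] * (max_seq - len(ids))          # 0 marks padded subwords
    masks += [-1] * (max_spans - len(masks))   # -1 marks padded spans

assert all(len(ids) == max_seq for ids in ids_)
assert all(len(m) == max_spans for m in entity_masks_)
```

The in-place `+=` matters: like the functions above, the sketch mutates the per-sentence lists so the caller's batch structures are padded without being rebuilt.
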
max_span_labels + # assert len(l2r_[idx][0]) == max_span_labels return max_span_labels +def partialize_optimizer_models_parameters(model): + """ + Partialize entity, relation and event models parameters from optimizer's parameters + """ + ner_params = list(model.NER_layer.named_parameters()) + rel_params = list(model.REL_layer.named_parameters()) + ev_params = list(model.EV_layer.named_parameters()) + + return ner_params, rel_params, ev_params + + +def gen_optimizer_grouped_parameters(param_optimizers, name, params): + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] + if not params['bert_warmup_lr']: + lr = float(params['ner_lr']) + if name == 'rel': + lr = float(params['rel_lr']) + if name == 'ev': + lr = float(params['ev_lr']) + else: + lr = params['learning_rate'] + + optimizer_grouped_parameters = [ + { + "name": name, + "params": [ + p + for n, p in param_optimizers + if not any(nd in n for nd in no_decay) + ], + "weight_decay": 0.01, + "lr": lr + }, + { + "name": name, + "params": [ + p + for n, p in param_optimizers + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + "lr": lr + }, + ] + + return optimizer_grouped_parameters + + +def prepare_optimizer_parameters(optimizer, rel_params, ev_params, conf_params, epoch): + if not conf_params['skip_ner']: + if epoch == conf_params['ner_epoch'] + 1: + print("Adding optimizer's REL model params") + rel_grouped_params = gen_optimizer_grouped_parameters(rel_params, "rel", conf_params) + optimizer.add_param_group(rel_grouped_params[0]) + optimizer.add_param_group(rel_grouped_params[1]) + if not conf_params['skip_rel']: + if epoch == conf_params['rel_epoch'] + 1: + print("Adding optimizer's EV model params") + ev_grouped_params = gen_optimizer_grouped_parameters(ev_params, "ev", conf_params) + optimizer.add_param_group(ev_grouped_params[0]) + optimizer.add_param_group(ev_grouped_params[1]) + else: + pass + + def get_tensors(data_ids, data, params): - tokens = [] + # for lstm + if params['use_lstm']: + tokens = [ + data["nn_data"]["tokens"][tr_data_id] + for tr_data_id in data_ids[0].tolist() + ] + else: + tokens = [] ids = [ data["nn_data"]["ids"][tr_data_id] @@ -198,12 +490,32 @@ def get_tensors(data_ids, data, params): data["nn_data"]["trigger_masks"][tr_data_id] for tr_data_id in data_ids[0].tolist() ] + gtruths = [ + data["nn_data"]["gtruth"][tr_data_id] + for tr_data_id in data_ids[0].tolist() + ] + l2rs = [ + data["nn_data"]["l2r"][tr_data_id] + for tr_data_id in data_ids[0].tolist() + ] span_terms = [ data["nn_data"]["span_terms"][tr_data_id] for tr_data_id in data_ids[0].tolist() ] + truth_evs = [ + data["nn_data"]["truth_ev"][tr_data_id] + for tr_data_id in data_ids[0].tolist() + ] + ev_idxs = [ + data["nn_data"]["ev_idxs"][tr_data_id] + for tr_data_id in data_ids[0].tolist() + ] + ev_lbls = [ + data["nn_data"]["ev_lbls"][tr_data_id] + for tr_data_id in data_ids[0].tolist() + ] etypes = [data["etypes"][tr_data_id] for tr_data_id in data_ids[0].tolist()] tokens = copy.deepcopy(tokens) @@ -215,22 +527,50 @@ def get_tensors(data_ids, data, params): span_labels_match_rel = copy.deepcopy(span_labels_match_rel) entity_masks = copy.deepcopy(entity_masks) trigger_masks = copy.deepcopy(trigger_masks) + gtruths = copy.deepcopy(gtruths) + l2rs = copy.deepcopy(l2rs) span_terms = copy.deepcopy(span_terms) + truth_evs = copy.deepcopy(truth_evs) + ev_idxs = copy.deepcopy(ev_idxs) + etypes = copy.deepcopy(etypes) + + # use lstm + if params['use_lstm']: + max_span_labels = padding_samples_lstm( + tokens, + ids, + token_masks, + 
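
> Editor's note: `gen_optimizer_grouped_parameters` and `prepare_optimizer_parameters` above stage the optimizer: it starts with the NER parameters, then gains the REL and EV groups once their pretraining epochs are reached. A runnable sketch of that pattern with dummy modules (toy sizes and learning rates, not the project's configuration):

```python
import torch
from torch import nn

ner, rel = nn.Linear(4, 4), nn.Linear(4, 4)

# start with the NER parameters only, split into decay / no-decay groups
# the same way gen_optimizer_grouped_parameters does
no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
groups = [
    {"params": [p for n, p in ner.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01, "lr": 1e-3},
    {"params": [p for n, p in ner.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0, "lr": 1e-3},
]
optimizer = torch.optim.Adam(groups)

# at a later epoch, grow the optimizer with the REL parameters,
# as prepare_optimizer_parameters does via optimizer.add_param_group
optimizer.add_param_group({"params": rel.parameters(),
                           "weight_decay": 0.01, "lr": 5e-4})
assert len(optimizer.param_groups) == 3
```

Adding parameter groups lazily keeps the frozen stages out of the optimizer entirely, rather than relying on zeroed gradients or per-parameter flags.
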
attention_masks, + span_indices, + span_labels, + span_labels_match_rel, + entity_masks, + trigger_masks, + gtruths, + l2rs, + ev_idxs, + params + ) - max_span_labels = padding_samples( - ids, - token_masks, - attention_masks, - span_indices, - span_labels, - span_labels_match_rel, - entity_masks, - trigger_masks, - params - ) + # use bert + else: + max_span_labels = padding_samples( + ids, + token_masks, + attention_masks, + span_indices, + span_labels, + span_labels_match_rel, + entity_masks, + trigger_masks, + gtruths, + l2rs, + ev_idxs, + params + ) # Padding etypes - etypes = _to_torch_data(etypes, max_span_labels, params) + etypes = c2t_utils._to_torch_data(etypes, max_span_labels, params) batch_ids = torch.tensor(ids, dtype=torch.long, device=params["device"]) batch_token_masks = torch.tensor( @@ -255,6 +595,11 @@ def get_tensors(data_ids, data, params): trigger_masks, dtype=torch.int8, device=params["device"] ) + batch_gtruths = gtruths + batch_l2rs = l2rs + batch_truth_evs = truth_evs + batch_ev_idxs = ev_idxs + return ( tokens, batch_ids, @@ -265,7 +610,12 @@ def get_tensors(data_ids, data, params): batch_span_labels_match_rel, batch_entity_masks, batch_trigger_masks, - span_terms, + batch_gtruths, + batch_l2rs, + span_terms, # ! << KHOA WAS HERE + batch_truth_evs, + batch_ev_idxs, + ev_lbls, etypes, max_span_labels ) @@ -411,6 +761,9 @@ def read_lines(filename): def write_lines(lines, filename, linesep="\n"): is_first_line = True + # make_dirs(os.path.dirname(filename)) + # os.makedirs(filename) + # with open(abs_path(filename), "w", encoding="UTF-8") as f: with open(filename, "w", encoding="UTF-8") as f: for line in lines: if is_first_line: @@ -418,3 +771,142 @@ def write_lines(lines, filename, linesep="\n"): else: f.write(linesep) f.write(line) + + # fig bug that not write file with empty prediction + # if len(lines) == 0: + # print(filename) + # f.write(linesep) + + +def list_compare(left, right): + """ + Failed cases: + a = np.array([[1,2,3], [4,5,6]]) + b = np.array([[1,2,3], [4,5,6]]) + # => Expected value: True + + a = np.array([[1,2,3], [4,5,6]]) + b = np.array([[1,2,3], np.array([4,5,6])]) + # => Expected value: True + + a = [np.array([1,2,3]), np.array([4,5,6])] + b = [np.array([1,2,3]), np.array([4,5,6])] + # => Expected value: True + + a = np.array([[1,2,3], [1,2,3]]) + b = np.array([[1,2,3]]) + # => Expected value: False + """ + if isinstance(left, np.ndarray): + left = left.tolist() + + if isinstance(right, np.ndarray): + right = right.tolist() + + if (isinstance(right, list) and not isinstance(left, list)) or ( + isinstance(left, list) and not isinstance(right, list)): + return False + + try: + return left == right + except: + try: + if len(left) == len(right): + for left_, right_ in zip(left, right): + if not list_compare(left_, right_): + return False + return True + else: + return False + except: + return False + + +def compare_event_truth(ev, truth): + if isinstance(ev, list) and isinstance(truth, list): + ev_args = sort_ev_args(ev, truth) + if ev_args: + truth_args = truth[1] + return compare_args(ev_args, truth_args) + else: + return list_compare(ev, truth) + + +def sort_ev_args(ev, truth): + if len(ev[0]) != len(truth[0]): + return None + ev_can = ev[0] + truth_can = truth[0] + ev_args = ev[1] + ev_sorted_args = [] + for can in truth_can: + if can in ev_can: + ev_sorted_args.append(ev_args[ev_can.index(can)]) + else: + return None + + return ev_sorted_args + + +def compare_args(ev_args, truth_args): + if isinstance(ev_args, np.ndarray): + ev_args = 
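
> Editor's note: the `list_compare` docstring above records the cases that motivate its recursive fallback. The snippet below reproduces the failure of plain `==` on nested numpy arrays (a toy illustration, independent of the helper itself):

```python
import numpy as np

a = [np.array([1, 2, 3]), np.array([4, 5, 6])]
b = [np.array([1, 2, 3]), np.array([4, 5, 6])]

# list equality calls bool() on each element-wise comparison, and the
# truth value of a multi-element boolean array is ambiguous
try:
    a == b
except ValueError as err:
    print(err)  # "The truth value of an array with more than one element ..."

# list_compare instead converts ndarrays to lists and recurses, so it can
# return a single True/False for structures like a and b
```
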
ev_args.tolist() + + if isinstance(truth_args, np.ndarray): + truth_args = truth_args.tolist() + + if isinstance(ev_args, list) and isinstance(truth_args, list): + if len(ev_args) != len(truth_args): + return False + for ev_arg, truth_arg in zip(ev_args, truth_args): + if not compare_event_truth(ev_arg, truth_arg): + return False + return True + else: + return False + + +def write_annotation_file( + ann_file, entities=None, triggers=None, relations=None, events=None +): + lines = [] + + def annotate_text_bound(entities): + for entity in entities.values(): + entity_annotation = "{}\t{} {} {}\t{}".format( + entity["id"], + entity["type"], + entity["start"], + entity["end"], + entity["ref"], + ) + lines.append(entity_annotation) + + if entities: + annotate_text_bound(entities) + + if triggers: + annotate_text_bound(triggers) + + if relations: + for relation in relations.values(): + relation_annotation = "{}\t{} {}:{} {}:{}".format( + relation["id"], + relation["role"], + relation["left_arg"]["label"], + relation["left_arg"]["id"], + relation["right_arg"]["label"], + relation["right_arg"]["id"], + ) + lines.append(relation_annotation) + + if events: + for event in events.values(): + event_annotation = "{}\t{}:{}".format( + event["id"], event["trigger_type"], event["trigger_id"] + ) + for arg in event["args"]: + event_annotation += " {}:{}".format(arg["role"], arg["id"]) + lines.append(event_annotation) + + write_lines(lines, ann_file) From b56641205ef5e7cd2379b840385e6ca208f55584 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:38:07 +0900 Subject: [PATCH 16/70] readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 84fc031..64922c8 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ sh run/train/generate_configs-debug.sh cg debug 4. Training - Pretrain layers (these need to be done before training the joint model) +- Replace "basic" by "debug" to quickly try experiments on the small data (debug mode) ```bash sh run.sh cg basic train ner sh run.sh cg basic train rel From b2734fb02f85d9f18d82a733124b903664738d82 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:52:06 +0900 Subject: [PATCH 17/70] training scripts --- README.md | 10 +++++----- run/train/train.sh | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 run/train/train.sh diff --git a/README.md b/README.md index 64922c8..71eaf9a 100644 --- a/README.md +++ b/README.md @@ -97,20 +97,20 @@ sh run/train/generate_configs-debug.sh cg debug - Pretrain layers (these need to be done before training the joint model) - Replace "basic" by "debug" to quickly try experiments on the small data (debug mode) ```bash -sh run.sh cg basic train ner -sh run.sh cg basic train rel -sh run.sh cg basic train ev +sh run/train/train.sh experiments/cg/basic/configs/train-ner.yaml +sh run/train/train.sh experiments/cg/basic/configs/train-rel.yaml +sh run/train/train.sh experiments/cg/basic/configs/train-ev.yaml ``` - Train joint model: given gold entity ```bash -sh run.sh cg basic train joint gold +sh run/train/train.sh experiments/cg/basic/configs/train-joint-gold.yaml ``` - Train joint end-to-end model ```bash -sh run.sh cg basic train joint e2e +sh run/train/train.sh experiments/cg/basic/configs/train-joint-e2e.yaml ``` # 3. 
Predict (BioNLP tasks) diff --git a/run/train/train.sh b/run/train/train.sh new file mode 100644 index 0000000..91193d6 --- /dev/null +++ b/run/train/train.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +ROOT=$PWD + +export PYTHONPATH="${PYTHONPATH}:$ROOT" +export PYTHONPATH="${PYTHONPATH}:$ROOT/model" + +CONFIG_PATH=$1 + +CONFIG_NAME=$(basename "$CONFIG_PATH") +CONFIG_NAME=(${CONFIG_NAME//./ }) + +CONFIG_DIR=$(dirname "$YAML_PATH") +EXP_DIR=$(dirname "$CONFIG_DIR") +LOG_DIR=$EXP_DIR/logs +mkdir -p $LOG_DIR + +nohup python -u main.py --yaml $YAML_PATH >> $LOG_DIR/$CONFIG_NAME.log & + +echo "The training log can be viewed at: $LOG_DIR/$CONFIG_NAME.log" +echo "Please wait several seconds to see it here." + +sleep 5 +tail -f $LOG_DIR/$CONFIG_NAME.log \ No newline at end of file From 7458407924862467a75f6128a39026ee7a4ec036 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 16:57:54 +0900 Subject: [PATCH 18/70] fix bug --- run/train/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run/train/train.sh b/run/train/train.sh index 91193d6..028bd10 100644 --- a/run/train/train.sh +++ b/run/train/train.sh @@ -10,12 +10,12 @@ CONFIG_PATH=$1 CONFIG_NAME=$(basename "$CONFIG_PATH") CONFIG_NAME=(${CONFIG_NAME//./ }) -CONFIG_DIR=$(dirname "$YAML_PATH") +CONFIG_DIR=$(dirname "$CONFIG_PATH") EXP_DIR=$(dirname "$CONFIG_DIR") LOG_DIR=$EXP_DIR/logs mkdir -p $LOG_DIR -nohup python -u main.py --yaml $YAML_PATH >> $LOG_DIR/$CONFIG_NAME.log & +nohup python -u train.py --yaml $CONFIG_PATH >> $LOG_DIR/$CONFIG_NAME.log & echo "The training log can be viewed at: $LOG_DIR/$CONFIG_NAME.log" echo "Please wait several seconds to see it here." From fb2a89821947c4f105ee217cc5042c7c45f4429d Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 17:14:26 +0900 Subject: [PATCH 19/70] script name --- model/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/training.py b/model/training.py index 6414bf9..5046db0 100644 --- a/model/training.py +++ b/model/training.py @@ -4,7 +4,7 @@ import os import pickle -from eval.evaluation import eval +from eval.evaluate import eval from utils import utils from utils.utils import debug, path from utils.utils import ( From c12486d1549eb7e3c026d8d9158cd8cde88ea218 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 17:15:22 +0900 Subject: [PATCH 20/70] missing script --- scripts/pipeline_process.py | 185 ++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 scripts/pipeline_process.py diff --git a/scripts/pipeline_process.py b/scripts/pipeline_process.py new file mode 100644 index 0000000..dd62541 --- /dev/null +++ b/scripts/pipeline_process.py @@ -0,0 +1,185 @@ +from collections import defaultdict + +from utils.utils import write_annotation_file + + +def get_entity_attrs(e_span_indice, words, offsets, sub_to_words): + e_words = [] + e_offset = [-1, -1] + curr_word_idx = -1 + for idx in range(e_span_indice[0], e_span_indice[1] + 1): + if sub_to_words[idx] != curr_word_idx: + e_words.append(words[sub_to_words[idx]]) + curr_word_idx = sub_to_words[idx] + if idx == e_span_indice[0]: + e_offset[0] = offsets[sub_to_words[idx]][0] + if idx == e_span_indice[1]: + e_offset[1] = offsets[sub_to_words[idx]][1] + return ' '.join(e_words), (e_offset[0], e_offset[1]) + + +def gen_sw_offsets(word_offsets, words, subwords, sub_to_words): + sw_offsets = [] + last_sw_offsets = -1 + for sw_id, w_id in sub_to_words.items(): + subword = subwords[sw_id].replace('##', '') + word = words[w_id] + 
word_offset = word_offsets[w_id] + sw_idx = word.index(subword, + 0 if (last_sw_offsets == -1 or last_sw_offsets < word_offset[0]) else last_sw_offsets - 1 - + word_offset[0]) + sw_offsets.append((word_offset[0] + sw_idx, word_offset[0] + sw_idx + len(subword))) + last_sw_offsets = word_offset[0] + sw_idx + len(subword) + return sw_offsets + + +def get_entity_sw_attrs(e_id, e_span_indice, words, offsets, sub_to_words, subwords, sw_offsets, org_mapping): + e_words = [] + e_offset = [-1, -1] + sw_text = [] + sw_offset = [-1, -1] + + curr_word_idx = -1 + for idx in range(e_span_indice[0], e_span_indice[1] + 1): + if sub_to_words[idx] != curr_word_idx: + e_words.append(words[sub_to_words[idx]]) + curr_word_idx = sub_to_words[idx] + sw_text.append(subwords[idx]) + if idx == e_span_indice[0]: + e_offset[0] = offsets[sub_to_words[idx]][0] + sw_offset[0] = sw_offsets[idx][0] + if idx == e_span_indice[1]: + e_offset[1] = offsets[sub_to_words[idx]][1] + sw_offset[1] = sw_offsets[idx][1] + org_mapping[e_id] = (' '.join(e_words), (e_offset[0], e_offset[1])) + return ' '.join(sw_text), (sw_offset[0], sw_offset[1]) + + +def gen_ner_ann_files(fidss, ent_anns, params): + dir2wr = params['pipeline_setting'] + params['pipe_ner'] + + # Initial ent map + map = defaultdict() + org_map = defaultdict() + params['pipeline_text_data'] = defaultdict() + for fids in fidss: + for fid in fids: + map[fid] = {} + org_map[fid] = {} + params['pipeline_text_data'][fid] = [] + + for xi, (fids, ent_ann) in enumerate(zip(fidss, ent_anns)): + # Mapping entities + entity_map = defaultdict() + for xb, (fid) in enumerate(fids): + span_indices = ent_ann['span_indices'][xb] + ner_terms = ent_ann['ner_terms'][xb] + ner_preds = ent_ann['ner_preds'][xb] + words = ent_ann['words'][xb] + offsets = ent_ann['offsets'][xb] + sub_to_words = ent_ann['sub_to_words'][xb] + subwords = ent_ann['subwords'][xb] + sw_offsets = gen_sw_offsets(offsets, words, subwords, sub_to_words) + params['pipeline_text_data'][fid].append( + {'words': subwords, 'offsets': sw_offsets}) + entities = map[fid] + org_mapping = org_map[fid] + + for x, pair in enumerate(span_indices): + if pair[0].item() == -1: + break + if ner_preds[x] > 0: + try: + e_id = ner_terms.id2term[x] + e_type = params['mappings']['rev_type_map'][ + params['mappings']['nn_mapping']['tag2type_map'][ner_preds[x]]] + e_words, e_offset = get_entity_sw_attrs(e_id, pair, words, offsets, sub_to_words, subwords, + sw_offsets, org_mapping) + entity_map[(xb, (pair[0].item(), pair[1].item()))] = ( + ner_preds[x], e_id, e_type, e_words, e_offset) + entities[e_id] = {"id": e_id, "type": e_type, "start": e_offset[0], "end": e_offset[1], + "ref": e_words} + except KeyError as error: + print('pred not map term', error) + + params['pipeline_entity_org_map'] = org_map + + for fid, ners in map.items(): + write_annotation_file(ann_file=dir2wr + fid + '.ann', entities=ners) + + +def gen_rel_ann_files(fidss, ent_anns, rel_anns, params): + dir2wr = params['pipeline_setting'] + params['pipe_rel'] + + # Initial ent+rel map + map = defaultdict() + for fids in fidss: + for fid in fids: + map[fid] = {'ents': {}, 'rels': {}} + + for xi, (fids, ent_ann, rel_ann) in enumerate(zip(fidss, ent_anns, rel_anns)): + # Mapping entities + entity_map = defaultdict() + for xb, (fid) in enumerate(fids): + span_indices = ent_ann['span_indices'][xb] + ner_terms = ent_ann['ner_terms'][xb] + ner_preds = ent_ann['ner_preds'][xb] + words = ent_ann['words'][xb] + offsets = ent_ann['offsets'][xb] + sub_to_words = ent_ann['sub_to_words'][xb] + 
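
> Editor's note: `gen_sw_offsets` above projects WordPiece pieces back onto character offsets in the source text. A simplified, single-word walk-through of the idea (toy inputs; the real helper additionally uses `last_sw_offsets` to disambiguate repeated substrings within a word):

```python
# "phosphorylation" occupies characters 10..25 of the document and is
# tokenized into three WordPiece pieces; each '##' piece continues the
# same word, so its span is the next slice of the word's span.
word_offset = (10, 25)
subwords = ["phos", "##phor", "##ylation"]

start = word_offset[0]
spans = []
for sw in subwords:
    piece = sw.replace("##", "")
    spans.append((start, start + len(piece)))
    start += len(piece)

assert spans == [(10, 14), (14, 18), (18, 25)]
```

These character spans are what let the pipeline write brat-style `.ann` offsets that point back into the original text rather than into the subword sequence.
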
+ entities = map[fid]['ents'] + + for x, pair in enumerate(span_indices): + if pair[0].item() == -1: + break + if ner_preds[x] > 0: + try: + e_id = ner_terms.id2term[x] + e_type = params['mappings']['rev_type_map'][ + params['mappings']['nn_mapping']['tag2type_map'][ner_preds[x]]] + e_words, e_offset = get_entity_attrs(pair, words, offsets, sub_to_words) + entity_map[(xb, (pair[0].item(), pair[1].item()))] = ( + ner_preds[x], e_id, e_type, e_words, e_offset) + entities[e_id] = {"id": e_id, "type": e_type, "start": e_offset[0], "end": e_offset[1], + "ref": e_words} + except KeyError as error: + print('pred not map term', error) + if len(rel_ann) > 0: + # Mapping relations + pairs_idx = rel_ann['pairs_idx'] + rel_preds = rel_ann['rel_preds'] + + pairs_idx_i = pairs_idx[0] + pairs_idx_j = pairs_idx[1] + pairs_idx_k = pairs_idx[2] + + for x, i in enumerate(pairs_idx_i): + relations = map[fids[i]]['rels'] + r_count = len(relations) + 1 + + j = pairs_idx_j[x] + k = pairs_idx_k[x] + rel = rel_preds[x].item() + role = params['mappings']['rev_rel_map'][rel].split(":")[1] + if role != 'Other': + arg1s = entity_map[ + (i.item(), (ent_ann['span_indices'][i][j][0].item(), ent_ann['span_indices'][i][j][1].item()))] + arg2s = entity_map[ + (i.item(), (ent_ann['span_indices'][i][k][0].item(), ent_ann['span_indices'][i][k][1].item()))] + + if int(params['mappings']['rev_rel_map'][rel].split(":")[0]) > int( + params['mappings']['rev_rel_map'][rel].split(":")[-1]): + arg1 = arg2s[1] + arg2 = arg1s[1] + else: + arg1 = arg1s[1] + arg2 = arg2s[1] + r_id = 'R' + str(r_count) + r_count += 1 + relations[r_id] = {"id": r_id, "role": role, + "left_arg": {"label": "Arg1", "id": arg1}, + "right_arg": {"label": "Arg2", "id": arg2}} + + for fid, ners_rels in map.items(): + write_annotation_file(ann_file=dir2wr + fid + '.ann', entities=ners_rels['ents'], relations=ners_rels['rels']) From 59808e3ab171c37b8947edb4a55d8af26e28408e Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 17:23:17 +0900 Subject: [PATCH 21/70] epochs for debug mode --- scripts/generate_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_configs.py b/scripts/generate_configs.py index 130b1b5..8698cd8 100644 --- a/scripts/generate_configs.py +++ b/scripts/generate_configs.py @@ -212,7 +212,7 @@ def set_debug_mode(configs, args): configs['test_data'] = configs['test_data'].replace('dev', "debug") configs['test_data'] = configs['test_data'].replace('test', "debug") if "epoch" in configs: - configs["epoch"] = 2 + configs["epoch"] = 10 def generate_configs(args, expdir, task, exp_name): """Generate configs for all.""" From 025690f5050b3ada10fbb83be98fff02e61bd446 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 23:35:34 +0900 Subject: [PATCH 22/70] scripts path --- README.md | 18 +++++++++--------- run/{train => }/download-bert.sh | 0 run/{train => }/generate-configs-debug.sh | 0 run/{train => }/generate-configs.sh | 0 run/{train => }/prepare-cg.sh | 0 run/{train => }/train.sh | 0 6 files changed, 9 insertions(+), 9 deletions(-) rename run/{train => }/download-bert.sh (100%) rename run/{train => }/generate-configs-debug.sh (100%) rename run/{train => }/generate-configs.sh (100%) rename run/{train => }/prepare-cg.sh (100%) rename run/{train => }/train.sh (100%) diff --git a/README.md b/README.md index 71eaf9a..4e01a83 100644 --- a/README.md +++ b/README.md @@ -73,44 +73,44 @@ python2 standalone.py - Download the processed event structures ```bash -sh run/train/prepare-cg.sh +sh 
run/prepare-cg.sh ``` 2. Download models - Download SciBERT model from PyTorch AllenNLP ```bash -sh run/train/download-bert.sh +sh run/download-bert.sh ``` 3. Generate configs - Configs for training CG task ```bash -sh run/train/generate_configs.sh cg basic +sh run/generate_configs.sh cg basic ``` - Experiment name: basic, exp1, exp2, etc - Or running this debug mode (on a small data with several epochs) ```bash -sh run/train/generate_configs-debug.sh cg debug +sh run/generate_configs-debug.sh cg debug ``` 4. Training - Pretrain layers (these need to be done before training the joint model) - Replace "basic" by "debug" to quickly try experiments on the small data (debug mode) ```bash -sh run/train/train.sh experiments/cg/basic/configs/train-ner.yaml -sh run/train/train.sh experiments/cg/basic/configs/train-rel.yaml -sh run/train/train.sh experiments/cg/basic/configs/train-ev.yaml +sh run/train.sh experiments/cg/basic/configs/train-ner.yaml +sh run/train.sh experiments/cg/basic/configs/train-rel.yaml +sh run/train.sh experiments/cg/basic/configs/train-ev.yaml ``` - Train joint model: given gold entity ```bash -sh run/train/train.sh experiments/cg/basic/configs/train-joint-gold.yaml +sh run/train.sh experiments/cg/basic/configs/train-joint-gold.yaml ``` - Train joint end-to-end model ```bash -sh run/train/train.sh experiments/cg/basic/configs/train-joint-e2e.yaml +sh run/train.sh experiments/cg/basic/configs/train-joint-e2e.yaml ``` # 3. Predict (BioNLP tasks) diff --git a/run/train/download-bert.sh b/run/download-bert.sh similarity index 100% rename from run/train/download-bert.sh rename to run/download-bert.sh diff --git a/run/train/generate-configs-debug.sh b/run/generate-configs-debug.sh similarity index 100% rename from run/train/generate-configs-debug.sh rename to run/generate-configs-debug.sh diff --git a/run/train/generate-configs.sh b/run/generate-configs.sh similarity index 100% rename from run/train/generate-configs.sh rename to run/generate-configs.sh diff --git a/run/train/prepare-cg.sh b/run/prepare-cg.sh similarity index 100% rename from run/train/prepare-cg.sh rename to run/prepare-cg.sh diff --git a/run/train/train.sh b/run/train.sh similarity index 100% rename from run/train/train.sh rename to run/train.sh From d2197b0c1417432cdf3ea93eeba31a637f7dda93 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 23:39:35 +0900 Subject: [PATCH 23/70] predict --- README.md | 13 +++++++++++++ predict.py | 2 +- run/predict.sh | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 run/predict.sh diff --git a/README.md b/README.md index 4e01a83..011f2e9 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,19 @@ sh run/train.sh experiments/cg/basic/configs/train-joint-gold.yaml sh run/train.sh experiments/cg/basic/configs/train-joint-e2e.yaml ``` +5. Predict +- Given gold entity +```bash +sh run/predict.sh experiments/cg/basic/configs/predict-gold-dev.yaml +sh run/predict.sh experiments/cg/basic/configs/predict-gold-test.yaml +``` + +- End-to-end +```bash +sh run/predict.sh experiments/cg/basic/configs/predict-e2e-dev.yaml +sh run/predict.sh experiments/cg/basic/configs/predict-e2e-test.yaml +``` + # 3. Predict (BioNLP tasks) ## 3.1. 
Prepare data diff --git a/predict.py b/predict.py index fc67343..63aad3e 100644 --- a/predict.py +++ b/predict.py @@ -7,7 +7,7 @@ from eval.evaluate import predict -from nets import deepEM +from model import deepEM from loader.prepData import prepdata from loader.prepNN import prep4nn from utils import utils diff --git a/run/predict.sh b/run/predict.sh new file mode 100644 index 0000000..dfe7e22 --- /dev/null +++ b/run/predict.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +ROOT=$PWD + +export PYTHONPATH="${PYTHONPATH}:$ROOT" +export PYTHONPATH="${PYTHONPATH}:$ROOT/model" + +CONFIG_PATH=$1 + +CONFIG_NAME=$(basename "$CONFIG_PATH") +CONFIG_NAME=(${CONFIG_NAME//./ }) + +CONFIG_DIR=$(dirname "$CONFIG_PATH") +EXP_DIR=$(dirname "$CONFIG_DIR") +LOG_DIR=$EXP_DIR/logs +mkdir -p $LOG_DIR + +nohup python -u predict.py --yaml $CONFIG_PATH >> $LOG_DIR/$CONFIG_NAME.log & + +echo "The training log can be viewed at: $LOG_DIR/$CONFIG_NAME.log" +echo "Please wait several seconds to see it here." + +sleep 5 +tail -f $LOG_DIR/$CONFIG_NAME.log \ No newline at end of file From cc7eaa47af5822e298019a875faf84eb354aee88 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 23:44:09 +0900 Subject: [PATCH 24/70] prediction script --- predict.py | 220 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 178 insertions(+), 42 deletions(-) diff --git a/predict.py b/predict.py index 63aad3e..d237630 100644 --- a/predict.py +++ b/predict.py @@ -1,12 +1,13 @@ import os import random import pickle +import time + import numpy as np import torch from torch.utils.data import TensorDataset, DataLoader, SequentialSampler -from eval.evaluate import predict - +from eval.evaluate import eval from model import deepEM from loader.prepData import prepdata from loader.prepNN import prep4nn @@ -14,94 +15,220 @@ def main(): - # read predict config + # check running time + t_start = time.time() + # set config path by command line inp_args = utils._parsing() config_path = getattr(inp_args, 'yaml') # set config path manually - # config_path = 'configs/debug.yaml' + # config_path = 'configs/default.yaml' with open(config_path, 'r') as stream: - pred_params = utils._ordered_load(stream) + parameters = utils._ordered_load(stream) + + # print config + utils._print_config(parameters, config_path) + + pred_params = parameters # Fix seed for reproducibility - os.environ["PYTHONHASHSEED"] = str(pred_params['seed']) - random.seed(pred_params['seed']) - np.random.seed(pred_params['seed']) - torch.manual_seed(pred_params['seed']) + os.environ["PYTHONHASHSEED"] = str(parameters['seed']) + random.seed(parameters['seed']) + np.random.seed(parameters['seed']) + torch.manual_seed(parameters['seed']) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + # Load configurations for prediction only + test_data_dir = parameters['test_data'] + params_dir = parameters['params'] + pipelines = parameters['pipelines'] + t_gpu = parameters['t_gpu'] + t_fp16 = parameters['t_fp16'] + t_batch_size = parameters['t_batch_size'] + + rel_eval_script_path = parameters['rel_eval_script_path'] + + ev_eval_script_path = parameters['ev_eval_script_path'] + + gold_eval = parameters['gold_eval'] + + bert_model = parameters['bert_model'] + # Load pre-trained parameters - with open(pred_params['saved_params'], "rb") as f: + with open(params_dir, "rb") as f: parameters = pickle.load(f) parameters['predict'] = True - # Set predict settings value for params - parameters['gpu'] = pred_params['gpu'] - parameters['batchsize'] = 
pred_params['batchsize'] + parameters['gpu'] = t_gpu + parameters['fp16'] = t_fp16 + parameters['batchsize'] = t_batch_size if parameters['gpu'] >= 0: device = torch.device("cuda:" + str(parameters['gpu']) if torch.cuda.is_available() else "cpu") + torch.cuda.set_device(parameters['gpu']) else: device = torch.device("cpu") parameters['device'] = device # Set evaluation settings - parameters['test_data'] = pred_params['test_data'] + parameters['test_data'] = test_data_dir + parameters['rel_eval_script_path'] = rel_eval_script_path + parameters['ev_eval_script_path'] = ev_eval_script_path - parameters['bert_model'] = pred_params['bert_model'] + parameters['gold_eval'] = gold_eval + parameters['bert_model'] = bert_model + parameters['pipelines'] = pipelines result_dir = pred_params['result_dir'] if not os.path.exists(result_dir): os.makedirs(result_dir) + parameters['pipeline_setting'] = result_dir + parameters['pipe_ner'] = 'pipe_ner/' + parameters['pipe_rel'] = 'pipe_rel/' + parameters['pipe_ev'] = 'pipe_ev/' + pipe_ner = parameters['pipeline_setting'] + parameters['pipe_ner'] + pipe_rel = parameters['pipeline_setting'] + parameters['pipe_rel'] + preprocess_pipe_dir(parameters['test_data'], pipe_ner) + preprocess_pipe_dir(parameters['test_data'], pipe_rel) parameters['result_dir'] = pred_params['result_dir'] - # raw text - parameters['raw_text'] = pred_params['raw_text'] - parameters['ner_predict_all'] = pred_params['raw_text'] - parameters['a2_entities'] = pred_params['a2_entities'] - - # process data - test_data = prepdata.prep_input_data(pred_params['test_data'], parameters) - nntest_data, test_dataloader = read_test_data(test_data, parameters) + parameters['ner_predict_all'] = pred_params['ner_predict_all'] - # model - deepee_model = deepEM.DeepEM(parameters) + if pipelines: - model_path = pred_params['model_path'] + ner_model_dir = pred_params['ner_model_dir'] + rel_model_dir = pred_params['rel_model_dir'] + ev_model_dir = pred_params['ev_model_dir'] - # Load all models - utils.handle_checkpoints(model=deepee_model, - checkpoint_dir=model_path, - params={ - 'device': device - }, - resume=True) + model_dir = {'NER': ner_model_dir, 'REL': rel_model_dir, 'EV': ev_model_dir} + data_dir = {'NER': test_data_dir, 'REL': pipe_ner, 'EV': pipe_rel} + # 1.Run NER model + print("Start running NER model") + run_pipeline('NER', model_dir, data_dir, parameters, device) + # 2.Run REL model + print("Start running REL model") + run_pipeline('REL', model_dir, data_dir, parameters, device) + # 3. Run EV model (final) + print("Start running EV model") + run_pipeline('EV', model_dir, data_dir, parameters, device) + else: + # 1. process data + test_data = prepdata.prep_input_data(test_data_dir, parameters) + test_data, test_dataloader = read_test_data(test_data, parameters) + + # 2. 
model + # Init zero model + deepee_model = deepEM.DeepEM(parameters) + + deepee_model_dir = pred_params['joint_model_dir'] + + # Load all models + utils.handle_checkpoints(model=deepee_model, + checkpoint_dir=deepee_model_dir, + params={ + 'device': device + }, + resume=True) + + deepee_model.to(device) + + # create output directory for results + result_dir = parameters['result_dir'] + if not os.path.exists(result_dir): + os.makedirs(result_dir) + + eval(model=deepee_model, + eval_dir=parameters['test_data'], + result_dir=result_dir, + eval_dataloader=test_dataloader, + eval_data=test_data, + params=parameters) + + print('PREDICT: DONE!') + + # calculate running time + t_end = time.time() + print('TOTAL RUNNING TIME: {}'.format(utils._humanized_time(t_end - t_start))) + + +def run_pipeline(model_type, model_dir, test_data_dir, params, device): + if model_type == 'NER': + deepee_model = deepEM.DeepEM(params) + utils.handle_checkpoints(model=deepee_model.NER_layer, + checkpoint_dir=model_dir[model_type] + 'ner_model/', + params={ + 'device': device + }, + resume=True) + params['pipe_flag'] = 0 + elif model_type == 'REL': + deepee_model = deepEM.DeepEM(params) + utils.handle_checkpoints(model=deepee_model.REL_layer, + checkpoint_dir=model_dir[model_type] + 'rel_model/', + params={ + 'device': device + }, + resume=True) + utils.handle_checkpoints(model=deepee_model.NER_layer, + checkpoint_dir=model_dir[model_type] + 'ner_model/', + params={ + 'device': device + }, + resume=True) + params['pipe_flag'] = 1 + elif model_type == 'EV': + deepee_model = deepEM.DeepEM(params) + utils.handle_checkpoints(model=deepee_model.EV_layer, + checkpoint_dir=model_dir[model_type] + 'ev_model/', + params={ + 'device': device + }, + resume=True) + utils.handle_checkpoints(model=deepee_model.REL_layer, + checkpoint_dir=model_dir[model_type] + 'rel_model/', + params={ + 'device': device + }, + resume=True) + utils.handle_checkpoints(model=deepee_model.NER_layer, + checkpoint_dir=model_dir[model_type] + 'ner_model/', + params={ + 'device': device + }, + resume=True) + params['pipe_flag'] = 2 + test_data = prepdata.prep_input_data(test_data_dir[model_type], params) + test_data, test_dataloader = read_test_data(test_data, params) deepee_model.to(device) - predict(model=deepee_model, - result_dir=result_dir, - eval_dataloader=test_dataloader, - eval_data=nntest_data, - g_entity_ids_=test_data['g_entity_ids_'], - params=parameters) + # create output directory for results + result_dir = 'results/' + params['experiment_name'] + '/' + if not os.path.exists(result_dir): + os.makedirs(result_dir) - # print('Done!') + eval(model=deepee_model, + eval_dir=test_data_dir['NER'], + result_dir=result_dir, + eval_dataloader=test_dataloader, + eval_data=test_data, + params=params) def read_test_data(test_data, params): - test = prep4nn.data2network(test_data, 'predict', params) + test, test_events_map = prep4nn.data2network(test_data, 'predict', params) if len(test) == 0: raise ValueError("Test set empty.") - test_data = prep4nn.torch_data_2_network(cdata2network=test, params=params, do_get_nn_data=True) + test_data = prep4nn.torch_data_2_network(cdata2network=test, events_map=test_events_map, params=params, + do_get_nn_data=True) te_data_size = len(test_data['nn_data']['ids']) test_data_ids = TensorDataset(torch.arange(te_data_size)) @@ -110,5 +237,14 @@ def read_test_data(test_data, params): return test_data, test_dataloader +def preprocess_pipe_dir(test_dir, pipe_dir): + if not os.path.exists(pipe_dir): + os.makedirs(pipe_dir) + 
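
> Editor's note: `run_pipeline` above restores checkpoints cumulatively: the REL stage reloads NER as well, and the EV stage reloads all three layers. A minimal sketch of restoring one sub-module at a time, which is the pattern `utils.handle_checkpoints` wraps here (illustrative paths and module names; the real helper also handles device mapping and resume bookkeeping):

```python
import torch
from torch import nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.NER_layer = nn.Linear(4, 4)
        self.REL_layer = nn.Linear(4, 4)

trained = Toy()
torch.save(trained.NER_layer.state_dict(), "/tmp/ner.pt")
torch.save(trained.REL_layer.state_dict(), "/tmp/rel.pt")

# a later stage rebuilds the model and restores layer by layer,
# loading every earlier stage's weights before its own
fresh = Toy()
fresh.NER_layer.load_state_dict(torch.load("/tmp/ner.pt"))
fresh.REL_layer.load_state_dict(torch.load("/tmp/rel.pt"))
```
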
else: + os.system('rm ' + pipe_dir + '*') + command = 'cp ' + test_dir + '*.txt ' + pipe_dir + os.system(command) + + if __name__ == '__main__': main() From ff8f6db0349c75ba6e1fed6f5d889613aa0bd4c9 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 23:56:51 +0900 Subject: [PATCH 25/70] readme --- README.md | 84 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 011f2e9..0bc014c 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,19 @@ # DeepEventMine + A deep leanring model to predict named entities, triggers, and nested events from biomedical texts. - The model and results are reported in our paper: -[DeepEventMine: End-to-end Neural Nested Event Extraction from Biomedical Texts](https://doi.org/10.1093/bioinformatics/btaa540), Bioinformatics, 2020. +[DeepEventMine: End-to-end Neural Nested Event Extraction from Biomedical Texts](https://doi.org/10.1093/bioinformatics/btaa540) +, Bioinformatics, 2020. ## Overview + 1. Features -- Based on [pre-trained BERT](https://github.com/allenai/scibert) -- Predict nested entities and nested events -- Provide our trained models on the seven biomedical tasks + +- End-to-end event extraction, fine-tuned on [pre-trained BERT](https://github.com/allenai/scibert) +- Train and predict nested entities and nested events +- Provide our pre-trained models on seven biomedical tasks - Reproduce the results reported in our [Bioinformatics](https://doi.org/10.1093/bioinformatics/btaa540) paper - Predict for new data given raw text input or PubMed ID - Visualize the predicted entities and events on the [brat](http://brat.nlplab.org) @@ -22,11 +26,13 @@ A deep leanring model to predict named entities, triggers, and nested events fro 2. ge11: [GENIA Event Extraction (GENIA), 2011](http://2011.bionlp-st.org/home/genia-event-extraction-genia) 3. ge13: [GENIA Event Extraction (GENIA), 2013](http://bionlp.dbcls.jp/projects/bionlp-st-ge-2013/wiki/Overview) 4. id: [Infectious Diseases (ID), 2011](http://2011.bionlp-st.org/home/infectious-diseases) -5. epi: [Epigenetics and Post-translational Modifications (EPI), 2011](http://2011.bionlp-st.org/home/epigenetics-and-post-translational-modifications) +5. +epi: [Epigenetics and Post-translational Modifications (EPI), 2011](http://2011.bionlp-st.org/home/epigenetics-and-post-translational-modifications) 6. pc: [Pathway Curation (PC), 2013](http://2013.bionlp-st.org/tasks/pathway-curation) 7. mlee: [Multi-Level Event Extraction (MLEE)](http://nactem.ac.uk/MLEE/) # 1. Preparation + 1. Install conda environment ```bash @@ -48,6 +54,7 @@ sh setup/conda-install.sh ``` 4. Install requirements + - Python 3.6.5 - PyTorch (torch==1.1.0 torchvision==0.3.0, cuda92) - Python dependencies @@ -57,6 +64,7 @@ pip install -r requirements.txt ``` 5. [Brat](https://github.com/nlplab/brat) for visualization + - brat instructions](http://brat.nlplab.org/installation.html) ```bash @@ -65,37 +73,50 @@ python2 standalone.py ``` # 2. Training CG + 1. Download data and process + - Download data - Process data to appropriate format - Tokenize texts and retrieve offsets - Data statistics - Download the processed event structures +- The original BioNLP 2013 (for downloading CG data) seems unavailable recently. We found an alternative link + for [CG13 task](https://sites.google.com/site/bionlpst2013/tasks/cancer-genetics-cg-task). 
You may download the data + by yourself (we are not sure the data is the same as the original link, so please check by yourself). ```bash sh run/prepare-cg.sh ``` 2. Download models + - Download SciBERT model from PyTorch AllenNLP + ```bash sh run/download-bert.sh ``` 3. Generate configs + - Configs for training CG task + ```bash sh run/generate_configs.sh cg basic ``` + - Experiment name: basic, exp1, exp2, etc - Or running this debug mode (on a small data with several epochs) + ```bash sh run/generate_configs-debug.sh cg debug ``` 4. Training + - Pretrain layers (these need to be done before training the joint model) - Replace "basic" by "debug" to quickly try experiments on the small data (debug mode) + ```bash sh run/train.sh experiments/cg/basic/configs/train-ner.yaml sh run/train.sh experiments/cg/basic/configs/train-rel.yaml @@ -103,6 +124,7 @@ sh run/train.sh experiments/cg/basic/configs/train-ev.yaml ``` - Train joint model: given gold entity + ```bash sh run/train.sh experiments/cg/basic/configs/train-joint-gold.yaml ``` @@ -114,13 +136,16 @@ sh run/train.sh experiments/cg/basic/configs/train-joint-e2e.yaml ``` 5. Predict + - Given gold entity + ```bash sh run/predict.sh experiments/cg/basic/configs/predict-gold-dev.yaml sh run/predict.sh experiments/cg/basic/configs/predict-gold-test.yaml ``` - End-to-end + ```bash sh run/predict.sh experiments/cg/basic/configs/predict-e2e-dev.yaml sh run/predict.sh experiments/cg/basic/configs/predict-e2e-test.yaml @@ -129,7 +154,9 @@ sh run/predict.sh experiments/cg/basic/configs/predict-e2e-test.yaml # 3. Predict (BioNLP tasks) ## 3.1. Prepare data + 1. Download corpora + - To download the original data sets from BioNLP shared tasks. - [task] = cg, pc, ge11, etc @@ -138,6 +165,7 @@ sh download.sh bionlp [task] ``` 2. Download our pre-trained DeepEventMine model on a given task + - [Our trained models](https://b2share.eudat.eu/records/80d2de0c57d64419b722dc1afa375f28) - [Our scores](https://b2share.eudat.eu/api/files/3cf6c1f4-5eed-4ee3-99c5-d99f5f011be3/scores.tar.gz) - [task] = cg (or pc, ge11, epi, etc) @@ -147,14 +175,18 @@ sh download.sh deepeventmine [task] ``` 3. Preprocess data + - Tokenize texts and prepare data for prediction + ```bash sh preprocess.sh bionlp ``` 4. Generate configs + - If using GPU: [gpu] = 0, otherwise: [gpu] = -1 - [task] = cg, pc, etc + ```bash sh run.sh config [task] [gpu] ``` @@ -162,6 +194,7 @@ sh run.sh config [task] [gpu] ## 3.2. Predict 1. For development and test sets (given gold entities) + - CG task: [task] = cg - PC task: [task] = pc - Similarly for: ge11, ge13, epi, id, mlee @@ -170,7 +203,9 @@ sh run.sh config [task] [gpu] sh run.sh predict [task] gold dev sh run.sh predict [task] gold test ``` + - Check the output in the path + ```bash experiments/[task]/predict-gold-dev/ experiments/[task]/predict-gold-test/ @@ -179,6 +214,7 @@ experiments/[task]/predict-gold-test/ ## 3.3. Evaluate 1. Retrieve the original offsets and create zip format + ```bash sh run.sh offset [task] gold dev sh run.sh offset [task] gold test @@ -187,10 +223,14 @@ sh run.sh offset [task] gold test 2. 
Submit the zipped file to the shared task evaluation sites: - [CG Test](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST-2013/CG/submission/) -- [GE11 Test](http://bionlp-st.dbcls.jp/GE/2011/eval-test/), [GE11 Devel](http://bionlp-st.dbcls.jp/GE/2011/eval-development/) -- [GE13 Test](http://bionlp-st.dbcls.jp/GE/2013/eval-test/), [GE13 Devel](http://bionlp-st.dbcls.jp/GE/2013/eval-development/) -- [ID Test](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/ID/test-eval.html), [ID Devel](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/ID/devel-eval.htm) -- [EPI Test](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/EPI/test-eval.html), [EPI Devel](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/EPI/devel-eval.htm) +- [GE11 Test](http://bionlp-st.dbcls.jp/GE/2011/eval-test/) + , [GE11 Devel](http://bionlp-st.dbcls.jp/GE/2011/eval-development/) +- [GE13 Test](http://bionlp-st.dbcls.jp/GE/2013/eval-test/) + , [GE13 Devel](http://bionlp-st.dbcls.jp/GE/2013/eval-development/) +- [ID Test](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/ID/test-eval.html) + , [ID Devel](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/ID/devel-eval.htm) +- [EPI Test](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/EPI/test-eval.html) + , [EPI Devel](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST/EPI/devel-eval.htm) - [PC Test](http://weaver.nlplab.org/~bionlp-st/BioNLP-ST-2013/PC/submission/) 3. Evaluate events @@ -205,18 +245,23 @@ sh run.sh eval [task] gold dev sp # 4. End-to-end ## 4.1. Input: a single PMID or PMCID + - Abstract + ```bash sh pubmed.sh e2e pmid 1370299 cg 0 ``` - Full text + ```bash sh pubmed.sh e2e pmcid PMC4353630 cg 0 ``` -- Input: [PMID: 1370299](https://pubmed.ncbi.nlm.nih.gov/1370299/), [PMCID: PMC4353630](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4353630/) (a single PubMed ID to get raw text) -- Model to predict: DeepEventMine trained on [cg (Cancer Genetics 2013)](http://2013.bionlp-st.org/tasks/cancer-genetics), (other options: pc, ge11, etc) +- Input: [PMID: 1370299](https://pubmed.ncbi.nlm.nih.gov/1370299/) + , [PMCID: PMC4353630](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4353630/) (a single PubMed ID to get raw text) +- Model to predict: DeepEventMine trained + on [cg (Cancer Genetics 2013)](http://2013.bionlp-st.org/tasks/cancer-genetics), (other options: pc, ge11, etc) - GPU: 0 (if CPU: -1) - Output: in brat format and [brat visualization](http://brat.nlplab.org) @@ -250,6 +295,7 @@ E24 Positive_regulation:T61 Theme:E10 - Given an arbitrary name for your raw text data, for example "my-pubmed" - Prepare a list of PMID and PMCID in the path + ```bash data/my-pubmed/pmid.txt ``` @@ -262,6 +308,7 @@ sh pubmed.sh e2e pmids my-pubmed cg 0 - Given an arbitrary name for your raw text data, for example "my-pubmed" - Prepare your raw text files in the path + ```bash data/my-pubmed/text/PMID-*.txt data/my-pubmed/text/PMC-*.txt @@ -293,6 +340,7 @@ data/my-pubmed/text/PMC-*.txt ### Get raw text 1. PubMed ID list + - In order to get full text given PMC ID, the text should be available in ePub (for our current version). - Prepare your list of PubMed ID and PMC ID in the path @@ -301,12 +349,15 @@ data/my-pubmed/pmid.txt ``` - Get text from the PubMed ID + ```bash sh pubmed.sh pmids my-pubmed ``` 2. PubMed ID + - You can also get text by directly input a PubMed or PMC ID + ```bash sh pubmed.sh pmid 1370299 sh pubmed.sh pmcid PMC4353630 @@ -321,6 +372,7 @@ sh pubmed.sh preprocess my-pubmed ## 5.3. Predict 1. 
Generate config + - Generate config for prediction - The data name to predict: my-pubmed - The trained model used for predict: cg (or pc, ge11, etc) @@ -343,6 +395,7 @@ sh pubmed.sh offset my-pubmed ``` - Check the output in + ```bash experiments/my-pubmed/results/ev-last/my-pubmed-brat ``` @@ -353,11 +406,13 @@ experiments/my-pubmed/results/ev-last/my-pubmed-brat - Copy the predicted data into the brat folder to visualize - For the raw text prediction: + ```bash sh pubmed.sh brat my-pubmed cg ``` - Or for the shared task + ```bash sh run.sh brat [task] gold dev sh run.sh brat [task] gold test @@ -373,10 +428,13 @@ brat/brat-v1.3_Crunchy_Frog/data/[task]-brat ``` # 7. Acknowledgements -This work is based on results obtained from a project commissioned by the New Energy and Industrial Technology Development Organization (NEDO). -This work is also supported by PRISM (Public/Private R&D Investment Strategic Expansion PrograM). + +This work is based on results obtained from a project commissioned by the New Energy and Industrial Technology +Development Organization (NEDO). This work is also supported by PRISM (Public/Private R&D Investment Strategic Expansion +PrograM). # 8. Citation + ```bash @article{10.1093/bioinformatics/btaa540, author = {Trieu, Hai-Long and Tran, Thy Thy and Duong, Khoa N A and Nguyen, Anh and Miwa, Makoto and Ananiadou, Sophia}, From d18e2d725f53a4142b04eb09a2e5bf83635d77b4 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Mon, 21 Mar 2022 23:59:03 +0900 Subject: [PATCH 26/70] readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0bc014c..85296a6 100644 --- a/README.md +++ b/README.md @@ -81,9 +81,9 @@ python2 standalone.py - Tokenize texts and retrieve offsets - Data statistics - Download the processed event structures -- The original BioNLP 2013 (for downloading CG data) seems unavailable recently. We found an alternative link +- The [original BioNLP 2013](http://2013.bionlp-st.org/tasks/cancer-genetics) (for downloading CG data) seems unavailable recently. We found an alternative link for [CG13 task](https://sites.google.com/site/bionlpst2013/tasks/cancer-genetics-cg-task). You may download the data - by yourself (we are not sure the data is the same as the original link, so please check by yourself). + by yourself. We are not sure the data is the same as the original link, so please check by yourself or contact the workshop's organizers. ```bash sh run/prepare-cg.sh From 22fac688423af9947762443fee5189f3fdc6b9ad Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:07:59 +0900 Subject: [PATCH 27/70] prediction configs --- pubmed.sh | 4 +- run.sh | 2 +- scripts/generate_pred_configs.py | 168 +++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 scripts/generate_pred_configs.py diff --git a/pubmed.sh b/pubmed.sh index ae22dbd..7a42ea9 100644 --- a/pubmed.sh +++ b/pubmed.sh @@ -52,7 +52,7 @@ elif [ "$TASK" = "config" ]; then GPU=$4 EXP_DIR="experiments/" - python scripts/generate_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU + python scripts/generate_pred_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU # predict elif [ "$TASK" = "predict" ]; then @@ -147,7 +147,7 @@ elif [ "$TASK" = "e2e" ]; then GPU=$5 EXP_DIR="experiments/" - python scripts/generate_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU + python scripts/generate_pred_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU echo "--------------------------------" echo "4. 
Predict: " diff --git a/run.sh b/run.sh index 13b19e2..8be193f 100644 --- a/run.sh +++ b/run.sh @@ -12,7 +12,7 @@ if [ "$TASK" = "config" ]; then GPU=$3 - python scripts/generate_configs.py $TASK_DIR $CORPUS_NAME $GPU + python scripts/generate_pred_configs.py $TASK_DIR $CORPUS_NAME $GPU # predict elif [ "$TASK" = "predict" ]; then diff --git a/scripts/generate_pred_configs.py b/scripts/generate_pred_configs.py new file mode 100644 index 0000000..4e07162 --- /dev/null +++ b/scripts/generate_pred_configs.py @@ -0,0 +1,168 @@ +"""Generating configs for training and evaluating models.""" + +import os +import sys + +sys.path.insert(0, '.') +from utils import utils + + +def write_config(datapath, config): + """Write config to file""" + + with open(datapath, 'w') as outfile: + for key, value in config.items(): + + # format + if key == 'bert_model' or key == 'test_data' or key == 'ev_eval_script_path' or key == 'result_dir' or key == 'gpu': + outfile.write('\n') + + outfile.write('{}: {}'.format(key, value)) + outfile.write('\n') + + +def gen_predict_config(predict_config, specific_config, eval_set, config_dir, model_name, taskdir): + """For joint prediction""" + + # dev and test sets + if eval_set == 'dev' or eval_set == 'test': + predict_config['test_data'] = ''.join(["data/corpora/", model_name, "/", eval_set, "/"]) + predict_config['result_dir'] = ''.join([taskdir, '/predict-gold-', eval_set, '/']) + + # overwrite task config + overwrite_task_config(predict_config, specific_config) + + write_config(os.path.join(config_dir, ''.join(['predict-gold-', eval_set, '.yaml'])), predict_config) + + # for raw texts + elif eval_set == 'raw-text': + predict_config['test_data'] = ''.join(["data/processed-raw-text/", model_name, "/"]) + predict_config['result_dir'] = ''.join([taskdir, '/predict-', eval_set, '/']) + predict_config['raw_text'] = True + predict_config['ner_predict_all'] = True + + # overwrite task config + overwrite_task_config(predict_config, specific_config) + + write_config(os.path.join(config_dir, ''.join(['predict-', eval_set, '.yaml'])), predict_config) + + +def gen_predict_config_pubmed(predict_config, specific_config, config_dir, expdir, dataname): + predict_config['test_data'] = ''.join(["data/", dataname, "/processed-text/", "text/"]) + predict_config['result_dir'] = ''.join([expdir, dataname, '/results/']) + predict_config['raw_text'] = True + predict_config['ner_predict_all'] = True + + # overwrite task config + overwrite_task_config(predict_config, specific_config) + write_config(os.path.join(config_dir, ''.join(['predict-', dataname, '.yaml'])), predict_config) + + +def overwrite_task_config(config, specific_config): + """Overwrite config for specific task.""" + + # add specific task config + for key, value in specific_config.items(): + if key in config: + config[key] = value + + return config + + +def read_specific_config(task): + """Specific config for specific task.""" + + # open specific config + task_config_path = ''.join(['configs/', task, '.yaml']) + + specific_config = {} + + # check exist and read config + if os.path.exists(task_config_path): + with open(task_config_path, 'r') as stream: + specific_config = utils._ordered_load(stream) + + return specific_config + + +def generate_configs(taskdir, task, gpu): + """Generate configs for all.""" + + # create experiment dir + config_dir = os.path.join(taskdir, 'configs') + utils.makedir(config_dir) + + # default setting + default_config_path = 'configs/default.yaml' + with open(default_config_path, 'r') as stream: + 
default_config = utils._ordered_load(stream) + + # read config for specific task + specific_config = read_specific_config(task) + + # generate config for each task + task_config = default_config.copy() + task_config['gpu'] = gpu + task_config['task_name'] = task_config['task_name'].replace('cg', task) + task_config['model_path'] = task_config['model_path'].replace('cg', task) + task_config['saved_params'] = task_config['saved_params'].replace('cg', task) + task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', task) + + # predict config + predict_dev_config = task_config.copy() + gen_predict_config(predict_dev_config, specific_config, 'dev', config_dir, task, taskdir) + + predict_test_config = task_config.copy() + gen_predict_config(predict_test_config, specific_config, 'test', config_dir, task, taskdir) + + # for raw text + predict_test_config = task_config.copy() + gen_predict_config(predict_test_config, specific_config, 'raw-text', config_dir, task, taskdir) + + print('Generate configs: Done!') + + return + + +def generate_configs_pubmed(expdir, dataname, model_name, gpu): + """Generate configs for all.""" + + # create experiment dir + config_dir = os.path.join(expdir, ''.join([dataname, '/configs'])) + utils.makedir(config_dir) + + # default setting + default_config_path = 'configs/default.yaml' + with open(default_config_path, 'r') as stream: + default_config = utils._ordered_load(stream) + + # read config for specific task + specific_config = read_specific_config(model_name) + + # generate config for each task + task_config = default_config.copy() + task_config['gpu'] = gpu + task_config['task_name'] = task_config['task_name'].replace('cg', model_name) + task_config['model_path'] = task_config['model_path'].replace('cg', model_name) + task_config['saved_params'] = task_config['saved_params'].replace('cg', model_name) + task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', model_name) + + # for raw text + predict_test_config = task_config.copy() + gen_predict_config_pubmed(predict_test_config, specific_config, config_dir, expdir, dataname) + + print('Generate configs: Done!') + + return + + +if __name__ == '__main__': + # generate_configs_pubmed("experiments/", "cg", "my-pubmed", 0) + + # bionlp + if len(sys.argv) == 4: + generate_configs(sys.argv[1], sys.argv[2], sys.argv[3]) + + # pubmed + elif len(sys.argv) == 5: + generate_configs_pubmed(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) From 0103c80a09ed25eb3b03a5a45e4598cba63eb7fd Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:12:19 +0900 Subject: [PATCH 28/70] bionlp prediction configs --- configs/bio-default.yaml | 25 +++++++++++++++++++ pubmed.sh | 4 +-- run.sh | 2 +- ...red_configs.py => generate_bio_configs.py} | 4 +-- 4 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 configs/bio-default.yaml rename scripts/{generate_pred_configs.py => generate_bio_configs.py} (98%) diff --git a/configs/bio-default.yaml b/configs/bio-default.yaml new file mode 100644 index 0000000..67c56ca --- /dev/null +++ b/configs/bio-default.yaml @@ -0,0 +1,25 @@ +# Configuration file. + +task_name: cg + +# model +bert_model: data/bert/scibert_scivocab_cased +model_path: data/models/cg/model/ +saved_params: data/models/cg/cg.param + +# data +test_data: .. + +# eval +ev_eval_script_path: eval/scripts/eval-ev-cg.py +a2_entities: [] +raw_text: False +ner_predict_all: False + +# output +result_dir: .. 
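
> Editor's note: the defaults in `bio-default.yaml` are merged with per-task overrides by `overwrite_task_config` in the script above, which only replaces keys that already exist in the default config, so a task file cannot smuggle in unknown settings. A toy check of that merge rule (hypothetical values):

```python
from collections import OrderedDict

# default config as loaded from bio-default.yaml (toy subset)
default = OrderedDict(task_name="cg", gpu=-1, batchsize=16)
# per-task override, e.g. from configs/pc.yaml
specific = {"batchsize": 8, "brand_new_key": True}

for key, value in specific.items():
    if key in default:
        default[key] = value

assert default["batchsize"] == 8
assert "brand_new_key" not in default   # unknown keys are silently ignored
```
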
+ +# params +gpu: -1 +batchsize: 16 +seed: 42 \ No newline at end of file diff --git a/pubmed.sh b/pubmed.sh index 7a42ea9..569019d 100644 --- a/pubmed.sh +++ b/pubmed.sh @@ -52,7 +52,7 @@ elif [ "$TASK" = "config" ]; then GPU=$4 EXP_DIR="experiments/" - python scripts/generate_pred_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU + python scripts/generate_bio_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU # predict elif [ "$TASK" = "predict" ]; then @@ -147,7 +147,7 @@ elif [ "$TASK" = "e2e" ]; then GPU=$5 EXP_DIR="experiments/" - python scripts/generate_pred_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU + python scripts/generate_bio_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU echo "--------------------------------" echo "4. Predict: " diff --git a/run.sh b/run.sh index 8be193f..c089a95 100644 --- a/run.sh +++ b/run.sh @@ -12,7 +12,7 @@ if [ "$TASK" = "config" ]; then GPU=$3 - python scripts/generate_pred_configs.py $TASK_DIR $CORPUS_NAME $GPU + python scripts/generate_bio_configs.py $TASK_DIR $CORPUS_NAME $GPU # predict elif [ "$TASK" = "predict" ]; then diff --git a/scripts/generate_pred_configs.py b/scripts/generate_bio_configs.py similarity index 98% rename from scripts/generate_pred_configs.py rename to scripts/generate_bio_configs.py index 4e07162..7944174 100644 --- a/scripts/generate_pred_configs.py +++ b/scripts/generate_bio_configs.py @@ -93,7 +93,7 @@ def generate_configs(taskdir, task, gpu): utils.makedir(config_dir) # default setting - default_config_path = 'configs/default.yaml' + default_config_path = 'configs/bio-default.yaml' with open(default_config_path, 'r') as stream: default_config = utils._ordered_load(stream) @@ -132,7 +132,7 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu): utils.makedir(config_dir) # default setting - default_config_path = 'configs/default.yaml' + default_config_path = 'configs/bio-default.yaml' with open(default_config_path, 'r') as stream: default_config = utils._ordered_load(stream) From ba13b99f9d8a8c0bbc0c35685a12cb80008bbfa8 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:17:25 +0900 Subject: [PATCH 29/70] prediction path --- scripts/generate_bio_configs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index 7944174..e4153dd 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -27,7 +27,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo # dev and test sets if eval_set == 'dev' or eval_set == 'test': predict_config['test_data'] = ''.join(["data/corpora/", model_name, "/", eval_set, "/"]) - predict_config['result_dir'] = ''.join([taskdir, '/predict-gold-', eval_set, '/']) + predict_config['result_dir'] = ''.join([taskdir, 'deepem-bionlp', '/predict-gold-', eval_set, '/']) # overwrite task config overwrite_task_config(predict_config, specific_config) @@ -37,7 +37,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo # for raw texts elif eval_set == 'raw-text': predict_config['test_data'] = ''.join(["data/processed-raw-text/", model_name, "/"]) - predict_config['result_dir'] = ''.join([taskdir, '/predict-', eval_set, '/']) + predict_config['result_dir'] = ''.join([taskdir, 'deepem-bionlp', '/predict-', eval_set, '/']) predict_config['raw_text'] = True predict_config['ner_predict_all'] = True @@ -49,7 +49,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo def 
gen_predict_config_pubmed(predict_config, specific_config, config_dir, expdir, dataname): predict_config['test_data'] = ''.join(["data/", dataname, "/processed-text/", "text/"]) - predict_config['result_dir'] = ''.join([expdir, dataname, '/results/']) + predict_config['result_dir'] = ''.join([expdir, dataname, 'deepem-bionlp', '/results/']) predict_config['raw_text'] = True predict_config['ner_predict_all'] = True @@ -128,7 +128,7 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(expdir, ''.join([dataname, '/configs'])) + config_dir = os.path.join(expdir, ''.join([dataname, 'deepem-bionlp', '/configs'])) utils.makedir(config_dir) # default setting From 3f98bcfdbab4dd83f4bb98acd9fd43478959786e Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:18:48 +0900 Subject: [PATCH 30/70] prediction path --- scripts/generate_bio_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index e4153dd..2ed280e 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -128,7 +128,7 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(expdir, ''.join([dataname, 'deepem-bionlp', '/configs'])) + config_dir = os.path.join(expdir, ''.join([dataname, '/deepem-bionlp', '/configs'])) utils.makedir(config_dir) # default setting From 7b9116e7ed0c91ad4fd0f1709156178d4f284c67 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:23:31 +0900 Subject: [PATCH 31/70] prediction configs --- scripts/generate_bio_configs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index 2ed280e..fde52ce 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -89,7 +89,8 @@ def generate_configs(taskdir, task, gpu): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(taskdir, 'configs') + config_dir = os.path.join(taskdir, '/deepem-bionlp/configs') + utils.makedir(config_dir) # default setting From 3c63c93846cb6255a2225fc78bf6773721c919b1 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:26:08 +0900 Subject: [PATCH 32/70] fix bug --- scripts/generate_bio_configs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index fde52ce..2dbb481 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -89,8 +89,7 @@ def generate_configs(taskdir, task, gpu): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(taskdir, '/deepem-bionlp/configs') - + config_dir = os.path.join(taskdir, ''.join['/', 'deepem-bionlp', '/', 'configs']) utils.makedir(config_dir) # default setting From 0af84d6c62ac12ce3c4c3bc52f3b4925e6f45f97 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:27:34 +0900 Subject: [PATCH 33/70] fix bug --- scripts/generate_bio_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index 2dbb481..61f0807 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -89,7 +89,7 @@ def generate_configs(taskdir, task, gpu): """Generate configs for all.""" # create experiment dir - 
config_dir = os.path.join(taskdir, ''.join['/', 'deepem-bionlp', '/', 'configs']) + config_dir = os.path.join(taskdir, ''.join['deepem-bionlp', '/', 'configs']) utils.makedir(config_dir) # default setting From fed491a2f8066d240c01b488ed5769e344110aeb Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:30:06 +0900 Subject: [PATCH 34/70] fix bug --- scripts/generate_bio_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index 61f0807..08f6a2d 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -89,7 +89,7 @@ def generate_configs(taskdir, task, gpu): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(taskdir, ''.join['deepem-bionlp', '/', 'configs']) + config_dir = os.path.join(os.path.join(taskdir, 'deepem-bionlp'), 'configs') utils.makedir(config_dir) # default setting From 0d453c4c712839f9b9d72e340c3b7391e36e5d1d Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:31:43 +0900 Subject: [PATCH 35/70] fix path --- scripts/generate_bio_configs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index 08f6a2d..6a584e1 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -128,7 +128,8 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu): """Generate configs for all.""" # create experiment dir - config_dir = os.path.join(expdir, ''.join([dataname, '/deepem-bionlp', '/configs'])) + config_dir = os.path.join(os.path.join(expdir, dataname), 'deepem-bionlp') + config_dir = os.path.join(config_dir, 'configs') utils.makedir(config_dir) # default setting From 1fa3839af70b81c05166f443eccb41633ba51f00 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:34:02 +0900 Subject: [PATCH 36/70] output path --- scripts/generate_bio_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py index 6a584e1..b0291e4 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_bio_configs.py @@ -27,7 +27,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo # dev and test sets if eval_set == 'dev' or eval_set == 'test': predict_config['test_data'] = ''.join(["data/corpora/", model_name, "/", eval_set, "/"]) - predict_config['result_dir'] = ''.join([taskdir, 'deepem-bionlp', '/predict-gold-', eval_set, '/']) + predict_config['result_dir'] = ''.join([taskdir, '/deepem-bionlp', '/predict-gold-', eval_set, '/']) # overwrite task config overwrite_task_config(predict_config, specific_config) @@ -37,7 +37,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo # for raw texts elif eval_set == 'raw-text': predict_config['test_data'] = ''.join(["data/processed-raw-text/", model_name, "/"]) - predict_config['result_dir'] = ''.join([taskdir, 'deepem-bionlp', '/predict-', eval_set, '/']) + predict_config['result_dir'] = ''.join([taskdir, '/deepem-bionlp', '/predict-', eval_set, '/']) predict_config['raw_text'] = True predict_config['ner_predict_all'] = True @@ -49,7 +49,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo def gen_predict_config_pubmed(predict_config, specific_config, config_dir, expdir, dataname): predict_config['test_data'] = ''.join(["data/", dataname, "/processed-text/", "text/"]) - 
predict_config['result_dir'] = ''.join([expdir, dataname, 'deepem-bionlp', '/results/'])
+    predict_config['result_dir'] = ''.join([expdir, dataname, '/deepem-bionlp', '/results/'])
     predict_config['raw_text'] = True
     predict_config['ner_predict_all'] = True
 

From 5495e835bb7dd5950e0ae69246579f6b23cff16f Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 00:35:35 +0900
Subject: [PATCH 37/70] update config path

---
 run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.sh b/run.sh
index c089a95..d420589 100644
--- a/run.sh
+++ b/run.sh
@@ -22,7 +22,7 @@ elif [ "$TASK" = "predict" ]; then
     DEV_TEST=$4 # predict for dev, test sets
 
     # predict
-    python predict.py --yaml $TASK_DIR/configs/$TASK-$GOLD_E2E-$DEV_TEST.yaml
+    python predict.py --yaml $TASK_DIR/deepem-bionlp/configs/$TASK-$GOLD_E2E-$DEV_TEST.yaml
 
 # retrieve offset
 elif [ "$TASK" = "offset" ]; then

From 1c41fda78f505c7f68c8213237ba363f14fc6fe7 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 00:42:16 +0900
Subject: [PATCH 38/70] load saved parameters

---
 configs/bio-default.yaml | 2 +-
 predict.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/bio-default.yaml b/configs/bio-default.yaml
index 67c56ca..d50278a 100644
--- a/configs/bio-default.yaml
+++ b/configs/bio-default.yaml
@@ -5,7 +5,7 @@ task_name: cg
 # model
 bert_model: data/bert/scibert_scivocab_cased
 model_path: data/models/cg/model/
-saved_params: data/models/cg/cg.param
+params: data/models/cg/cg.param
 
 # data
 test_data: ..
diff --git a/predict.py b/predict.py
index d237630..7552a6a 100644
--- a/predict.py
+++ b/predict.py
@@ -44,7 +44,6 @@ def main():
 
     # Load configurations for prediction only
     test_data_dir = parameters['test_data']
-    params_dir = parameters['params']
     pipelines = parameters['pipelines']
     t_gpu = parameters['t_gpu']
     t_fp16 = parameters['t_fp16']
@@ -59,6 +58,7 @@
 
     bert_model = parameters['bert_model']
 
     # Load pre-trained parameters
+    params_dir = parameters['params']
     with open(params_dir, "rb") as f:
         parameters = pickle.load(f)

From a21989c5c387c9f2defbe65f3c7a860b3adba8de Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 00:43:27 +0900
Subject: [PATCH 39/70] saved params config

---
 scripts/generate_bio_configs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/generate_bio_configs.py b/scripts/generate_bio_configs.py
index b0291e4..7f18420 100644
--- a/scripts/generate_bio_configs.py
+++ b/scripts/generate_bio_configs.py
@@ -105,7 +105,7 @@ def generate_configs(taskdir, task, gpu):
     task_config['gpu'] = gpu
     task_config['task_name'] = task_config['task_name'].replace('cg', task)
     task_config['model_path'] = task_config['model_path'].replace('cg', task)
-    task_config['saved_params'] = task_config['saved_params'].replace('cg', task)
+    task_config['params'] = task_config['params'].replace('cg', task)
     task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', task)
 
     # predict config
@@ -145,7 +145,7 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu):
     task_config['gpu'] = gpu
     task_config['task_name'] = task_config['task_name'].replace('cg', model_name)
     task_config['model_path'] = task_config['model_path'].replace('cg', model_name)
-    task_config['saved_params'] = task_config['saved_params'].replace('cg', model_name)
+    task_config['params'] = task_config['params'].replace('cg', model_name)
     task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', model_name)
 
     # for raw text

From 
19ceca75ed80e36cc9a1ca0790140c94d3762302 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:51:02 +0900 Subject: [PATCH 40/70] prediction scripts --- predict.py | 2 +- pubmed.sh | 8 +- run.sh | 4 +- ...bio_configs.py => generate_configs_bio.py} | 4 +- scripts/predict_bio.py | 114 ++++++++++++++++++ 5 files changed, 123 insertions(+), 9 deletions(-) rename scripts/{generate_bio_configs.py => generate_configs_bio.py} (97%) create mode 100644 scripts/predict_bio.py diff --git a/predict.py b/predict.py index 7552a6a..d237630 100644 --- a/predict.py +++ b/predict.py @@ -44,6 +44,7 @@ def main(): # Load configurations for prediction only test_data_dir = parameters['test_data'] + params_dir = parameters['params'] pipelines = parameters['pipelines'] t_gpu = parameters['t_gpu'] t_fp16 = parameters['t_fp16'] @@ -58,7 +59,6 @@ def main(): bert_model = parameters['bert_model'] # Load pre-trained parameters - params_dir = parameters['params'] with open(params_dir, "rb") as f: parameters = pickle.load(f) diff --git a/pubmed.sh b/pubmed.sh index 569019d..e9015a6 100644 --- a/pubmed.sh +++ b/pubmed.sh @@ -52,7 +52,7 @@ elif [ "$TASK" = "config" ]; then GPU=$4 EXP_DIR="experiments/" - python scripts/generate_bio_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU + python scripts/generate_configs_bio.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU # predict elif [ "$TASK" = "predict" ]; then @@ -62,7 +62,7 @@ elif [ "$TASK" = "predict" ]; then EXP_DIR="experiments/$MY_DATA" # predict - python predict.py --yaml $EXP_DIR/configs/$TASK-$MY_DATA.yaml + python scripts/predict_bio.py --yaml $EXP_DIR/configs/$TASK-$MY_DATA.yaml # retrieve offset elif [ "$TASK" = "offset" ]; then @@ -147,7 +147,7 @@ elif [ "$TASK" = "e2e" ]; then GPU=$5 EXP_DIR="experiments/" - python scripts/generate_bio_configs.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU + python scripts/generate_configs_bio.py $EXP_DIR $MY_DATA $MODEL_NAME $GPU echo "--------------------------------" echo "4. Predict: " @@ -155,7 +155,7 @@ elif [ "$TASK" = "e2e" ]; then EXP_DIR="experiments/$MY_DATA" # predict - python predict.py --yaml $EXP_DIR/configs/predict-$MY_DATA.yaml + python scripts/predict_bio.py --yaml $EXP_DIR/configs/predict-$MY_DATA.yaml echo "--------------------------------" echo "5. 
Retrieve original offsets: " diff --git a/run.sh b/run.sh index d420589..6227d6f 100644 --- a/run.sh +++ b/run.sh @@ -12,7 +12,7 @@ if [ "$TASK" = "config" ]; then GPU=$3 - python scripts/generate_bio_configs.py $TASK_DIR $CORPUS_NAME $GPU + python scripts/generate_configs_bio.py $TASK_DIR $CORPUS_NAME $GPU # predict elif [ "$TASK" = "predict" ]; then @@ -22,7 +22,7 @@ elif [ "$TASK" = "predict" ]; then DEV_TEST=$4 # predict for dev, test sets # predict - python predict.py --yaml $TASK_DIR/deepem-bionlp/configs/$TASK-$GOLD_E2E-$DEV_TEST.yaml + python scripts/predict_bio.py --yaml $TASK_DIR/deepem-bionlp/configs/$TASK-$GOLD_E2E-$DEV_TEST.yaml # retrieve offset elif [ "$TASK" = "offset" ]; then diff --git a/scripts/generate_bio_configs.py b/scripts/generate_configs_bio.py similarity index 97% rename from scripts/generate_bio_configs.py rename to scripts/generate_configs_bio.py index 7f18420..b0291e4 100644 --- a/scripts/generate_bio_configs.py +++ b/scripts/generate_configs_bio.py @@ -105,7 +105,7 @@ def generate_configs(taskdir, task, gpu): task_config['gpu'] = gpu task_config['task_name'] = task_config['task_name'].replace('cg', task) task_config['model_path'] = task_config['model_path'].replace('cg', task) - task_config['params'] = task_config['params'].replace('cg', task) + task_config['saved_params'] = task_config['saved_params'].replace('cg', task) task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', task) # predict config @@ -145,7 +145,7 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu): task_config['gpu'] = gpu task_config['task_name'] = task_config['task_name'].replace('cg', model_name) task_config['model_path'] = task_config['model_path'].replace('cg', model_name) - task_config['params'] = task_config['params'].replace('cg', model_name) + task_config['saved_params'] = task_config['saved_params'].replace('cg', model_name) task_config['ev_eval_script_path'] = task_config['ev_eval_script_path'].replace('cg', model_name) # for raw text diff --git a/scripts/predict_bio.py b/scripts/predict_bio.py new file mode 100644 index 0000000..fc67343 --- /dev/null +++ b/scripts/predict_bio.py @@ -0,0 +1,114 @@ +import os +import random +import pickle +import numpy as np +import torch +from torch.utils.data import TensorDataset, DataLoader, SequentialSampler + +from eval.evaluate import predict + +from nets import deepEM +from loader.prepData import prepdata +from loader.prepNN import prep4nn +from utils import utils + + +def main(): + # read predict config + # set config path by command line + inp_args = utils._parsing() + config_path = getattr(inp_args, 'yaml') + + # set config path manually + # config_path = 'configs/debug.yaml' + + with open(config_path, 'r') as stream: + pred_params = utils._ordered_load(stream) + + # Fix seed for reproducibility + os.environ["PYTHONHASHSEED"] = str(pred_params['seed']) + random.seed(pred_params['seed']) + np.random.seed(pred_params['seed']) + torch.manual_seed(pred_params['seed']) + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Load pre-trained parameters + with open(pred_params['saved_params'], "rb") as f: + parameters = pickle.load(f) + + parameters['predict'] = True + + # Set predict settings value for params + parameters['gpu'] = pred_params['gpu'] + parameters['batchsize'] = pred_params['batchsize'] + if parameters['gpu'] >= 0: + device = torch.device("cuda:" + str(parameters['gpu']) if torch.cuda.is_available() else "cpu") + 
torch.cuda.set_device(parameters['gpu']) + else: + device = torch.device("cpu") + parameters['device'] = device + + # Set evaluation settings + parameters['test_data'] = pred_params['test_data'] + + parameters['bert_model'] = pred_params['bert_model'] + + result_dir = pred_params['result_dir'] + if not os.path.exists(result_dir): + os.makedirs(result_dir) + + parameters['result_dir'] = pred_params['result_dir'] + + # raw text + parameters['raw_text'] = pred_params['raw_text'] + parameters['ner_predict_all'] = pred_params['raw_text'] + parameters['a2_entities'] = pred_params['a2_entities'] + + # process data + test_data = prepdata.prep_input_data(pred_params['test_data'], parameters) + nntest_data, test_dataloader = read_test_data(test_data, parameters) + + # model + deepee_model = deepEM.DeepEM(parameters) + + model_path = pred_params['model_path'] + + # Load all models + utils.handle_checkpoints(model=deepee_model, + checkpoint_dir=model_path, + params={ + 'device': device + }, + resume=True) + + deepee_model.to(device) + + predict(model=deepee_model, + result_dir=result_dir, + eval_dataloader=test_dataloader, + eval_data=nntest_data, + g_entity_ids_=test_data['g_entity_ids_'], + params=parameters) + + # print('Done!') + + +def read_test_data(test_data, params): + test = prep4nn.data2network(test_data, 'predict', params) + + if len(test) == 0: + raise ValueError("Test set empty.") + + test_data = prep4nn.torch_data_2_network(cdata2network=test, params=params, do_get_nn_data=True) + te_data_size = len(test_data['nn_data']['ids']) + + test_data_ids = TensorDataset(torch.arange(te_data_size)) + test_sampler = SequentialSampler(test_data_ids) + test_dataloader = DataLoader(test_data_ids, sampler=test_sampler, batch_size=params['batchsize']) + return test_data, test_dataloader + + +if __name__ == '__main__': + main() From d66972ef00487a763c46e1d8533c4d43643cc46c Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:51:50 +0900 Subject: [PATCH 41/70] fix bug --- configs/bio-default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/bio-default.yaml b/configs/bio-default.yaml index d50278a..67c56ca 100644 --- a/configs/bio-default.yaml +++ b/configs/bio-default.yaml @@ -5,7 +5,7 @@ task_name: cg # model bert_model: data/bert/scibert_scivocab_cased model_path: data/models/cg/model/ -params: data/models/cg/cg.param +saved_params: data/models/cg/cg.param # data test_data: .. 
From 7a875077b68ed90b5731176a5e96966ededd4352 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:53:10 +0900 Subject: [PATCH 42/70] python path --- pubmed.sh | 4 ++++ run.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pubmed.sh b/pubmed.sh index e9015a6..76e3549 100644 --- a/pubmed.sh +++ b/pubmed.sh @@ -1,5 +1,9 @@ #!/bin/bash +ROOT=$PWD +export PYTHONPATH="${PYTHONPATH}:$ROOT" +export PYTHONPATH="${PYTHONPATH}:$ROOT/eval" + TASK=$1 # Get Text from PubMed ID & PMC ID diff --git a/run.sh b/run.sh index 6227d6f..d6e57a8 100644 --- a/run.sh +++ b/run.sh @@ -1,5 +1,9 @@ #!/bin/bash +ROOT=$PWD +export PYTHONPATH="${PYTHONPATH}:$ROOT" +export PYTHONPATH="${PYTHONPATH}:$ROOT/eval" + TASK=$1 # generate configs, predict, retrieve offsets, evaluate CORPUS_NAME=$2 # cg, ge11, pc, etc From babfeb31db7fd20470d67fa22349631cad703ca3 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:55:44 +0900 Subject: [PATCH 43/70] bionlp prediction --- eval/evaluate.py | 164 +++++++++++++++++++++++++++++++++++++++++ scripts/predict_bio.py | 4 +- 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index 7033e98..f3a01c2 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -504,3 +504,167 @@ def print_scores(k, v, stoso): v['macro'][stoso + '_p'], v['macro'][stoso + '_r'], v['macro'][stoso + '_f']), end="", ) print() + +def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params): + mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping'] + + # store predicted entities + ent_preds = [] + + # store predicted events + ev_preds = [] + + fidss, wordss, offsetss, sub_to_wordss, span_indicess = [], [], [], [], [] + + # entity and relation output + ent_anns = [] + rel_anns = [] + + # Evaluation phase + model.eval() + + all_ner_preds, all_ner_golds, all_ner_terms = [], [], [] + + is_eval_ev = False + + for step, batch in enumerate( + tqdm(eval_dataloader, desc="Iteration", leave=False) + ): + eval_data_ids = batch + tensors = utils.get_tensors(eval_data_ids, eval_data, params) + + nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, _, \ + etypes, _ = tensors + + fids = [ + eval_data["fids"][data_id] for data_id in eval_data_ids[0].tolist() + ] + offsets = [ + eval_data["offsets"][data_id] + for data_id in eval_data_ids[0].tolist() + ] + words = [ + eval_data["words"][data_id] for data_id in eval_data_ids[0].tolist() + ] + sub_to_words = [ + eval_data["sub_to_words"][data_id] + for data_id in eval_data_ids[0].tolist() + ] + subwords = [ + eval_data["subwords"][data_id] + for data_id in eval_data_ids[0].tolist() + ] + gold_entities = [ + eval_data["entities"][data_id] + for data_id in eval_data_ids[0].tolist() + ] + + with torch.no_grad(): + ner_out, rel_out, ev_out = model(tensors, params) + + ner_preds = ner_out['preds'] + + ner_terms = ner_out['terms'] + + all_ner_terms.append(ner_terms) + + for sentence_idx, ner_pred in enumerate(ner_preds): + all_ner_golds.append( + [ + ( + sub_to_words[sentence_idx][span_start], + sub_to_words[sentence_idx][span_end], + mapping_id_tag[label_id], + ) + for ( + span_start, + span_end, + ), label_ids in gold_entities[sentence_idx].items() + for label_id in label_ids + ] + ) + + pred_entities = [] + for span_id, ner_pred_id in enumerate(ner_pred): + span_start, span_end = nn_span_indices[sentence_idx][span_id] + span_start, span_end = span_start.item(), span_end.item() + if 
(ner_pred_id > 0 + and span_start in sub_to_words[sentence_idx] + and span_end in sub_to_words[sentence_idx] + ): + pred_entities.append( + ( + sub_to_words[sentence_idx][span_start], + sub_to_words[sentence_idx][span_end], + mapping_id_tag[ner_pred_id], + ) + ) + all_ner_preds.append(pred_entities) + + # entity prediction + ent_ann = {'span_indices': nn_span_indices, 'ner_preds': ner_out['preds'], 'words': words, + 'offsets': offsets, 'sub_to_words': sub_to_words, 'subwords': subwords, + 'ner_terms': ner_terms} + ent_anns.append(ent_ann) + + fidss.append(fids) + + wordss.append(words) + offsetss.append(offsets) + sub_to_wordss.append(sub_to_words) + + # relation prediction + if rel_out != None: + pairs_idx = rel_out['pairs_idx'] + rel_pred = rel_out['preds'] + + rel_ann = {'pairs_idx': pairs_idx, 'rel_preds': rel_pred} + rel_anns.append(rel_ann) + else: + rel_anns.append({}) + + # event prediction + if ev_out != None: + # add predicted entity + ent_preds.append(ner_out["nner_preds"]) + + # add predicted events + ev_preds.append(ev_out) + + span_indicess.append( + [ + indice.detach().cpu().numpy() + for indice in ner_out["span_indices"] + ] + ) + is_eval_ev = True + else: + ent_preds.append([]) + ev_preds.append([]) + + span_indicess.append([]) + + # Clear GPU unused RAM: + if params['gpu'] >= 0: + torch.cuda.empty_cache() + # write entity and relation prediction + _ = write_entity_relations( + result_dir=result_dir, + fidss=fidss, + ent_anns=ent_anns, + rel_anns=rel_anns, + params=params + ) + + if is_eval_ev > 0: + write_events(fids=fidss, + all_ent_preds=ent_preds, + all_words=wordss, + all_offsets=offsetss, + all_span_terms=all_ner_terms, + all_span_indices=span_indicess, + all_sub_to_words=sub_to_wordss, + all_ev_preds=ev_preds, + g_entity_ids_=g_entity_ids_, + params=params, + result_dir=result_dir) \ No newline at end of file diff --git a/scripts/predict_bio.py b/scripts/predict_bio.py index fc67343..065845a 100644 --- a/scripts/predict_bio.py +++ b/scripts/predict_bio.py @@ -5,7 +5,7 @@ import torch from torch.utils.data import TensorDataset, DataLoader, SequentialSampler -from eval.evaluate import predict +from eval.evaluate import predict_bio from nets import deepEM from loader.prepData import prepdata @@ -85,7 +85,7 @@ def main(): deepee_model.to(device) - predict(model=deepee_model, + predict_bio(model=deepee_model, result_dir=result_dir, eval_dataloader=test_dataloader, eval_data=nntest_data, From c44d387a19b99c4776b24436e0887cb4445dcd88 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:56:18 +0900 Subject: [PATCH 44/70] fix path --- scripts/predict_bio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/predict_bio.py b/scripts/predict_bio.py index 065845a..76ad8d4 100644 --- a/scripts/predict_bio.py +++ b/scripts/predict_bio.py @@ -7,7 +7,7 @@ from eval.evaluate import predict_bio -from nets import deepEM +from model import deepEM from loader.prepData import prepdata from loader.prepNN import prep4nn from utils import utils From 246a1056141dd34f6855d2a565018b08bfa1292d Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:57:52 +0900 Subject: [PATCH 45/70] setup --- README.md | 2 +- requirements.txt => setup/requirements.txt | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename requirements.txt => setup/requirements.txt (100%) diff --git a/README.md b/README.md index 85296a6..4111ed0 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ sh setup/conda-install.sh - Python dependencies ```bash -pip install -r 
requirements.txt +pip install -r setup/requirements.txt ``` 5. [Brat](https://github.com/nlplab/brat) for visualization diff --git a/requirements.txt b/setup/requirements.txt similarity index 100% rename from requirements.txt rename to setup/requirements.txt From 5eec9122a805851e9936fbb6f29e1254d13ca5f7 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 00:59:00 +0900 Subject: [PATCH 46/70] sklearn version --- setup/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup/requirements.txt b/setup/requirements.txt index d31bf44..b84c69c 100644 --- a/setup/requirements.txt +++ b/setup/requirements.txt @@ -7,7 +7,7 @@ loguru tabulate pyyaml texttable -sklearn +sklearn==0.23.2 pytorch-nlp tqdm requests From 7018136da5539b0c5a2a0e4507b423799492081f Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:00:55 +0900 Subject: [PATCH 47/70] sklearn --- setup/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup/requirements.txt b/setup/requirements.txt index b84c69c..9fc92d4 100644 --- a/setup/requirements.txt +++ b/setup/requirements.txt @@ -7,7 +7,7 @@ loguru tabulate pyyaml texttable -sklearn==0.23.2 +scikit-learn==0.23.2 pytorch-nlp tqdm requests From c803f2c708c167838b26b0bac6f3ee82cc7e124b Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:04:26 +0900 Subject: [PATCH 48/70] fix bug --- scripts/predict_bio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/predict_bio.py b/scripts/predict_bio.py index 76ad8d4..afc6544 100644 --- a/scripts/predict_bio.py +++ b/scripts/predict_bio.py @@ -96,7 +96,7 @@ def main(): def read_test_data(test_data, params): - test = prep4nn.data2network(test_data, 'predict', params) + test, _ = prep4nn.data2network(test_data, 'predict', params) if len(test) == 0: raise ValueError("Test set empty.") From 12205e394eb1957e400bb0240ba77c6125d391c9 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:06:32 +0900 Subject: [PATCH 49/70] fix bug --- scripts/predict_bio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/predict_bio.py b/scripts/predict_bio.py index afc6544..5f318b0 100644 --- a/scripts/predict_bio.py +++ b/scripts/predict_bio.py @@ -96,12 +96,12 @@ def main(): def read_test_data(test_data, params): - test, _ = prep4nn.data2network(test_data, 'predict', params) + test, test_events_map = prep4nn.data2network(test_data, 'predict', params) if len(test) == 0: raise ValueError("Test set empty.") - test_data = prep4nn.torch_data_2_network(cdata2network=test, params=params, do_get_nn_data=True) + test_data = prep4nn.torch_data_2_network(cdata2network=test, events_map=test_events_map, params=params, do_get_nn_data=True) te_data_size = len(test_data['nn_data']['ids']) test_data_ids = TensorDataset(torch.arange(te_data_size)) From d740223002d7c6bbccc696afbebd14d9d308c608 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:10:31 +0900 Subject: [PATCH 50/70] process input --- loader/prepData/prepdata.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/loader/prepData/prepdata.py b/loader/prepData/prepdata.py index cb5a6f0..d0f222d 100644 --- a/loader/prepData/prepdata.py +++ b/loader/prepData/prepdata.py @@ -51,5 +51,18 @@ def prep_input_data(files_fold, params): if diff: print(doc_name, sorted(diff, key=lambda _id: int(_id.replace("T", "")))) + # entity indices + g_entity_ids_ = OrderedDict() + for fid, fdata in entities0.items(): + # get max entity id + eid_ 
= [eid for eid in fdata['ids'] if not eid.startswith('TR')] + ids_ = [int(eid.replace('T', '')) for eid in eid_] + if len(ids_) > 0: + max_id = max(ids_) + else: + max_id = 0 + eid_.append(max_id) + g_entity_ids_[fid] = eid_ + return {'entities': entities1, 'triggers': triggers1, 'terms': terms0, 'relations': relations0, 'events': events0, - 'sentences': sentences1, 'input': input1, 'structsTR': structsTR} + 'sentences': sentences1, 'input': input1, 'structsTR': structsTR, 'g_entity_ids_': g_entity_ids_} From 9666202b7b36f84e2e10ca9013c467d1ef607de4 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:11:14 +0900 Subject: [PATCH 51/70] fix bug --- loader/prepData/prepdata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loader/prepData/prepdata.py b/loader/prepData/prepdata.py index d0f222d..8204a2e 100644 --- a/loader/prepData/prepdata.py +++ b/loader/prepData/prepdata.py @@ -1,4 +1,5 @@ """Load data from brat format and process for entity, trigger, relation, events.""" +from collections import OrderedDict from loader.prepData.brat import brat_loader from loader.prepData.sentence import prep_sentence_offsets, process_input From e3b676321f778e4926f04e9f551476f1d7d82b65 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:15:27 +0900 Subject: [PATCH 52/70] fix bug --- eval/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index f3a01c2..2ea8115 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -533,8 +533,8 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa eval_data_ids = batch tensors = utils.get_tensors(eval_data_ids, eval_data, params) - nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, _, \ - etypes, _ = tensors + nn_tokens, nn_ids, nn_token_mask, nn_attention_mask, nn_span_indices, nn_span_labels, nn_span_labels_match_rel, nn_entity_masks, nn_trigger_masks, nn_gtruth, nn_l2r, _, \ + nn_truth_ev, nn_ev_idxs, ev_lbls, etypes, _ = tensors fids = [ eval_data["fids"][data_id] for data_id in eval_data_ids[0].tolist() From a4c5deaa5c701a110e2bd4336a0f0834b2dd439c Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:21:13 +0900 Subject: [PATCH 53/70] nested events in prediction --- model/EVNet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model/EVNet.py b/model/EVNet.py index 440b3b2..0fc2970 100644 --- a/model/EVNet.py +++ b/model/EVNet.py @@ -638,10 +638,10 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch): all_preds_output = [] # flag to train nested event or not - if n_epoch >= self.params['ev_nested_epoch'] or self.params['predict']: - enable_nested_ev = True - else: - enable_nested_ev = False + enable_nested_ev = True + if not self.params['predict']: + if n_epoch < self.params['ev_nested_epoch']: + enable_nested_ev = False # flag to train modality or not if n_epoch >= self.params['modality_epoch'] or self.params['predict']: From f879ca746c258dce3c8696c413f95c96f6c5de77 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:23:42 +0900 Subject: [PATCH 54/70] modality in prediction --- model/EVNet.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/model/EVNet.py b/model/EVNet.py index 0fc2970..5ba06d2 100644 --- a/model/EVNet.py +++ b/model/EVNet.py @@ -637,17 +637,14 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch): # 
store output all_preds_output = [] - # flag to train nested event or not + # flag to train nested event, train modality or not enable_nested_ev = True + enable_modality = True if not self.params['predict']: if n_epoch < self.params['ev_nested_epoch']: enable_nested_ev = False - - # flag to train modality or not - if n_epoch >= self.params['modality_epoch'] or self.params['predict']: - enable_modality = True - else: - enable_modality = False + if n_epoch < self.params['modality_epoch']: + enable_modality = False # store all predictions for flat and nested, maximum as 3 nested levels # TODO: revise the maximum nested level later. Now fix 3 levels From 2eecdb167c692609cc77516d147d524ea5b75a2d Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:25:51 +0900 Subject: [PATCH 55/70] fix bug --- eval/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index 2ea8115..344cd8b 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -560,7 +560,7 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa ] with torch.no_grad(): - ner_out, rel_out, ev_out = model(tensors, params) + ner_out, rel_out, ev_out, _ = model(tensors, params) ner_preds = ner_out['preds'] From 98f5fe8d4b03b7e20a4619131d63ce173130dfce Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:35:00 +0900 Subject: [PATCH 56/70] write annotations --- eval/evaluate.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index 344cd8b..783dc4b 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -4,7 +4,7 @@ from tqdm import tqdm from eval.evalEV import evaluate_ev -from eval.evalRE import estimate_perf, estimate_rel +from eval.evalRE import estimate_perf, estimate_rel, gen_annotation from eval.evalNER import eval_nner from scripts.pipeline_process import gen_ner_ann_files, gen_rel_ann_files from utils import utils @@ -648,13 +648,14 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa if params['gpu'] >= 0: torch.cuda.empty_cache() # write entity and relation prediction - _ = write_entity_relations( - result_dir=result_dir, - fidss=fidss, - ent_anns=ent_anns, - rel_anns=rel_anns, - params=params - ) + gen_annotation(fidss, ent_anns, rel_anns, params, result_dir) + # _ = write_entity_relations( + # result_dir=result_dir, + # fidss=fidss, + # ent_anns=ent_anns, + # rel_anns=rel_anns, + # params=params + # ) if is_eval_ev > 0: write_events(fids=fidss, From 4c913d0c5bc4acfa985bcfa2f2b5a5591f40faeb Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:43:02 +0900 Subject: [PATCH 57/70] write output --- eval/evaluate.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index 783dc4b..eb093e0 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -505,6 +505,7 @@ def print_scores(k, v, stoso): ) print() + def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params): mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping'] @@ -658,14 +659,26 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa # ) if is_eval_ev > 0: - write_events(fids=fidss, - all_ent_preds=ent_preds, - all_words=wordss, - all_offsets=offsetss, - all_span_terms=all_ner_terms, - all_span_indices=span_indicess, - all_sub_to_words=sub_to_wordss, - all_ev_preds=ev_preds, - g_entity_ids_=g_entity_ids_, - 
params=params, - result_dir=result_dir) \ No newline at end of file + # write_events(fids=fidss, + # all_ent_preds=ent_preds, + # all_words=wordss, + # all_offsets=offsetss, + # all_span_terms=all_ner_terms, + # all_span_indices=span_indicess, + # all_sub_to_words=sub_to_wordss, + # all_ev_preds=ev_preds, + # g_entity_ids_=g_entity_ids_, + # params=params, + # result_dir=result_dir) + + _ = evaluate_ev(fids=fidss, + all_ent_preds=ent_preds, + all_words=wordss, + all_offsets=offsetss, + all_span_terms=all_ner_terms, + all_span_indices=span_indicess, + all_sub_to_words=sub_to_wordss, + all_ev_preds=ev_preds, + params=params, + gold_dir=eval_dir, + result_dir=result_dir) From 1ff214595bc458b996a03090152468dc5ab6f90c Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 01:51:03 +0900 Subject: [PATCH 58/70] fix bug --- eval/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval/evaluate.py b/eval/evaluate.py index eb093e0..e80ce6e 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -680,5 +680,5 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa all_sub_to_words=sub_to_wordss, all_ev_preds=ev_preds, params=params, - gold_dir=eval_dir, + gold_dir=eval_data, result_dir=result_dir) From aad6562507b0d0fc5897765eaf6e6533cd69d8fb Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 02:06:06 +0900 Subject: [PATCH 59/70] write output for prediction --- eval/evalEV.py | 192 +++++++++++++++++++++++++++++++++++++++++++++++ eval/evalRE.py | 173 ++++++++++++++++++++++++++++++++++++++++++ eval/evaluate.py | 53 +++++-------- 3 files changed, 385 insertions(+), 33 deletions(-) diff --git a/eval/evalEV.py b/eval/evalEV.py index 7590b4a..7cea518 100644 --- a/eval/evalEV.py +++ b/eval/evalEV.py @@ -500,3 +500,195 @@ def extract_fscore(path): return {'sub_scores': (float(sub_precision.strip()), float(sub_recall.strip()), float(sub_fscore.strip())), 'mod_scores': (float(mod_precision.strip()), float(mod_recall.strip()), float(mod_fscore.strip())), 'tot_scores': (float(tot_precision.strip()), float(tot_recall.strip()), float(tot_fscore.strip()))} + +# write events to file +def write_ev_2file_bio(pred_output, pred_ents, result_dir, g_entity_ids_, params): + a2dir = result_dir + 'ev-last/ev-tok-a2/' + anndir = result_dir + 'ev-last/ev-tok-ann/' + rev_type_map = params['mappings']['rev_type_map'] + + # entity id mapping + # feid_mapping = collections.OrderedDict() + + if not os.path.exists(a2dir): + os.makedirs(a2dir) + else: + os.system('rm ' + a2dir + '*.a2') + + if not os.path.exists(anndir): + os.makedirs(anndir) + else: + os.system('rm ' + anndir + '*.a2') + os.system('rm ' + anndir + '*.a1') + + # write event and triggers, (and entity: if predict both entity and trigger) + for fid, preds in pred_output.items(): + ev_en_preds_ = preds[0] + events = preds[1] + + enid_mapping, en_preds_out_, a2_ents_ = mapping_entity_id(pred_ents[fid], g_entity_ids_[fid], rev_type_map, params) + + # entity and trigger for ann file + ann_en_lines = [] + ann_tr_lines = [] + # write entity and trigger from entity predictions + for pr_id, e_pred in pred_ents[fid].items(): + e0_id = e_pred[0] + e_id = enid_mapping[e0_id] + + output = ''.join( + [e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t', + e_pred[3], '\n']) + + if e0_id.startswith('TR'): + ann_tr_lines.append(output) + + # only write entity to a1 + elif e0_id.startswith('T'): + + # entity in a2 + if e0_id in a2_ents_: + ann_tr_lines.append(output) + + else: + 
ann_en_lines.append(output) + + + # entity and trigger output for a2 + a2_en_lines_ = [] + a2_tr_lines_ = [] + + # write entity and trigger only included event predictions + # write entity and then trigger + for e_pred in ev_en_preds_: + e0_id = e_pred[0] + e_id = enid_mapping[e0_id] + + output = ''.join( + [e_id, '\t', rev_type_map[e_pred[1]], ' ', str(e_pred[2][0]), ' ', str(e_pred[2][1]), '\t', + e_pred[3], '\n']) + + if e0_id.startswith('TR'): + a2_tr_lines_.append(output) + elif e0_id.startswith('T'): + + # entity in a2 + if e0_id in a2_ents_: + a2_tr_lines_.append(output) + + else: + a2_en_lines_.append(output) + + # event output + ev_lines = [] + + # count event id + f_evid = 0 + + # mapping event id to incremental id + f_evid_map = collections.OrderedDict() + + # store modality + mod_list = [] + + for event_ in events: + + # create event id + evid = convert_evid_to_number(event_[0]) + + # lookup in the map or create a new id + if evid in f_evid_map: + evid_out = f_evid_map[evid] + else: + f_evid += 1 + evid_out = f_evid + f_evid_map[evid] = evid_out + + trid = event_[1][0] + trid = enid_mapping[trid] + typeEV = rev_type_map[event_[1][1]] + args_data = event_[2] + mod_pred = event_[3] + + args_output = '' + for arg_ in args_data: + + # relation type + typeR = arg_[0] + + # check event or entity argument + if len(arg_) > 2: + argIdE = arg_[1] + nest_evid = convert_evid_to_number(argIdE) + if nest_evid in f_evid_map: + nest_evid_out = f_evid_map[nest_evid] + eid = 'E' + str(nest_evid_out) + else: + print('ERROR: NESTED EVENT BUT MISSING EVENT ARGUMENT.') + + # entity argument + else: + a2data = arg_[1] + eid = a2data[0] + + # mapping entity id: predict entity or entity in a2 + if params['ner_predict_all'] or eid in a2_ents_: + eid = enid_mapping[eid] + + if len(args_output) > 0: + args_output += ' ' + + args_output += typeR + ':' + eid + + # if has argument + if len(args_output) > 0: + output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, ' ', args_output, '\n']) + ev_lines.append(output) + + # no argument + else: + output = ''.join(['E', str(evid_out), '\t', typeEV, ':', trid, '\n']) + ev_lines.append(output) + + # check and store modality + if mod_pred > 1: + mod_value = params['mappings']['rev_modality_map'][mod_pred] + mod_list.append([mod_value, evid_out]) + + # write modality + if len(mod_list) > 0: + for mod_id, mod_data in enumerate(mod_list): + mod_type = mod_data[0] + evid_out = mod_data[1] + output = ''.join(['M', str(mod_id + 1), '\t', mod_type, ' ', 'E', str(evid_out), '\n']) + ev_lines.append(output) + + # write a2 files + with open(a2dir + fid + '.a2', 'w') as o2file: + + # write entity + if params['ner_predict_all']: + for entity in a2_en_lines_: + o2file.write(entity) + + for trigger in a2_tr_lines_: + o2file.write(trigger) + for event in ev_lines: + o2file.write(event) + + # write ann file + with open(anndir + fid + '.a1', 'w') as o1file: + for entity in ann_en_lines: + o1file.write(entity) + + with open(anndir + fid + '.a2', 'w') as annfile: + for entity in ann_en_lines: + annfile.write(entity) + for trigger in ann_tr_lines: + annfile.write(trigger) + + # events are the same for both a2 and ann + for event in ev_lines: + annfile.write(event) + + return \ No newline at end of file diff --git a/eval/evalRE.py b/eval/evalRE.py index ba3c077..cc2b3a3 100644 --- a/eval/evalRE.py +++ b/eval/evalRE.py @@ -508,3 +508,176 @@ def extract_fscore(path): 'so_p': float(soft_p.strip()) * 100} return report + +def write_entity_relations(result_dir, fidss, ent_anns, 
rel_anns, params):
+    # def gen_annotation(fidss, ent_anns, rel_anns, params, result_dir):
+    """Generate entity and relation prediction"""
+
+    dir2wr = ''.join([result_dir, 'rel-last/rel-ann/'])
+    if not os.path.exists(dir2wr):
+        os.makedirs(dir2wr)
+    else:
+        os.system('rm ' + dir2wr + '*.ann')
+
+    # Initial ent+rel map
+    map = defaultdict()
+
+    for fids in fidss:
+        for fid in fids:
+            map[fid] = {'ents': {}, 'rels': {}}
+
+    for xi, (fids, ent_ann, rel_ann) in enumerate(zip(fidss, ent_anns, rel_anns)):
+        # Mapping entities
+        entity_map = defaultdict()
+        for xb, (fid) in enumerate(fids):
+            span_indices = ent_ann['span_indices'][xb]
+            ner_terms = ent_ann['ner_terms'][xb]
+            ner_preds = ent_ann['ner_preds'][xb]
+            words = ent_ann['words'][xb]
+            offsets = ent_ann['offsets'][xb]
+            sub_to_words = ent_ann['sub_to_words'][xb]
+
+            entities = map[fid]['ents']
+
+            for x, pair in enumerate(span_indices):
+                if pair[0].item() == -1:
+                    break
+                if ner_preds[x] > 0:
+                    try:
+                        e_id = ner_terms.id2term[x]
+                        e_type = params['mappings']['rev_type_map'][
+                            params['mappings']['nn_mapping']['tag2type_map'][ner_preds[x]]]
+                        if 'pipeline_entity_org_map' in params:
+                            if e_id in params['pipeline_entity_org_map'][fid]:
+                                e_words, e_offset = params['pipeline_entity_org_map'][fid][e_id]
+                            else:
+                                print(e_id)
+                                e_words, e_offset = get_entity_attrs(pair, words, offsets, sub_to_words)
+                        else:
+                            e_words, e_offset = get_entity_attrs(pair, words, offsets, sub_to_words)
+
+                        # save entity map
+                        entity_map[(xb, x)] = (
+                            ner_preds[x], e_id, e_type, e_words, e_offset)
+
+                        # save entity dic info
+                        entities[e_id] = {"id": e_id, "type": e_type, "start": e_offset[0], "end": e_offset[1],
+                                          "ref": e_words}
+                    except KeyError as error:
+                        print('pred not map term', error)
+        if len(rel_ann) > 0:
+            # Mapping relations
+            pairs_idx = rel_ann['pairs_idx']
+            rel_preds = rel_ann['rel_preds']
+
+            pairs_idx_i = pairs_idx[0]
+            pairs_idx_j = pairs_idx[1]
+            pairs_idx_k = pairs_idx[2]
+
+            for x, i in enumerate(pairs_idx_i):
+                relations = map[fids[i]]['rels']
+                r_count = len(relations) + 1
+
+                j = pairs_idx_j[x]
+                k = pairs_idx_k[x]
+                rel = rel_preds[x].item()
+                role = params['mappings']['rev_rel_map'][rel].split(":")[1]
+                if role != 'Other':
+
+                    try:
+                        arg1s = entity_map[(i.item(), j.item())]
+                        arg2s = entity_map[(i.item(), k.item())]
+
+                        if int(params['mappings']['rev_rel_map'][rel].split(":")[0]) > int(
+                                params['mappings']['rev_rel_map'][rel].split(":")[-1]):
+                            arg1 = arg2s[1]
+                            arg2 = arg1s[1]
+                        else:
+                            arg1 = arg1s[1]
+                            arg2 = arg2s[1]
+                        r_id = 'R' + str(r_count)
+                        r_count += 1
+                        relations[r_id] = {"id": r_id, "role": role,
+                                           "left_arg": {"label": "Arg1", "id": arg1},
+                                           "right_arg": {"label": "Arg2", "id": arg2}}
+                    except KeyError as error:
+                        print('error relation', fids[i], error)
+
+    for fid, ners_rels in map.items():
+        write_annotation_file_bio(dir2wr, fid, entities=ners_rels['ents'],
+                                  relations=ners_rels['rels'])
+
+
+def mapping_entity_id(entities_):
+    eid = 1
+    enid_mapping = collections.OrderedDict()
+    en_preds_out_ = []
+
+    # create mapping for entity id first
+    for en_id, en_data in entities_.items():
+
+        if en_id.startswith('TR'):
+            continue
+
+        elif en_id.startswith('T'):
+            enid_mapping[en_id] = 'T' + str(eid)
+            eid += 1
+            en_preds_out_.append(en_data)
+
+    # create mapping for trigger id
+    for en_id, en_data in entities_.items():
+
+        if en_id.startswith('TR'):
+            enid_mapping[en_id] = 'T' + str(eid)
+            eid += 1
+            en_preds_out_.append(en_data)
+
+    return enid_mapping, en_preds_out_
+
+
+def write_annotation_file_bio(dir2wr, fid, entities=None, 
relations=None): + re_lines = [] + en_lines = [] + tr_lines = [] + + # entity id mapping + enid_mapping, en_preds_out_ = mapping_entity_id(entities) + + if entities: + for entity in en_preds_out_: + entity_annotation = "{}\t{} {} {}\t{}".format( + enid_mapping[entity["id"]], + entity["type"], + entity["start"], + entity["end"], + entity["ref"], + ) + + re_lines.append(entity_annotation) + + if entity["id"].startswith('TR'): + tr_lines.append(entity_annotation) + + elif entity["id"].startswith('T'): + en_lines.append(entity_annotation) + + if relations: + for relation in relations.values(): + relation_annotation = "{}\t{} {}:{} {}:{}".format( + relation["id"], + relation["role"], + relation["left_arg"]["label"], + enid_mapping[relation["left_arg"]["id"]], + relation["right_arg"]["label"], + enid_mapping[relation["right_arg"]["id"]], + ) + re_lines.append(relation_annotation) + + # write to file + re_file = ''.join([dir2wr, fid, '-RE.ann']) + en_file = ''.join([dir2wr, fid, '-EN.ann']) + tr_file = ''.join([dir2wr, fid, '-TR.ann']) + + write_lines(re_lines, re_file) + write_lines(en_lines, en_file) + write_lines(tr_lines, tr_file) \ No newline at end of file diff --git a/eval/evaluate.py b/eval/evaluate.py index e80ce6e..fda46e5 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -3,8 +3,8 @@ import torch from tqdm import tqdm -from eval.evalEV import evaluate_ev -from eval.evalRE import estimate_perf, estimate_rel, gen_annotation +from eval.evalEV import evaluate_ev, write_ev_2file_bio +from eval.evalRE import estimate_perf, estimate_rel, write_entity_relations from eval.evalNER import eval_nner from scripts.pipeline_process import gen_ner_ann_files, gen_rel_ann_files from utils import utils @@ -649,36 +649,23 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa if params['gpu'] >= 0: torch.cuda.empty_cache() # write entity and relation prediction - gen_annotation(fidss, ent_anns, rel_anns, params, result_dir) - # _ = write_entity_relations( - # result_dir=result_dir, - # fidss=fidss, - # ent_anns=ent_anns, - # rel_anns=rel_anns, - # params=params - # ) + _ = write_entity_relations( + result_dir=result_dir, + fidss=fidss, + ent_anns=ent_anns, + rel_anns=rel_anns, + params=params + ) if is_eval_ev > 0: - # write_events(fids=fidss, - # all_ent_preds=ent_preds, - # all_words=wordss, - # all_offsets=offsetss, - # all_span_terms=all_ner_terms, - # all_span_indices=span_indicess, - # all_sub_to_words=sub_to_wordss, - # all_ev_preds=ev_preds, - # g_entity_ids_=g_entity_ids_, - # params=params, - # result_dir=result_dir) - - _ = evaluate_ev(fids=fidss, - all_ent_preds=ent_preds, - all_words=wordss, - all_offsets=offsetss, - all_span_terms=all_ner_terms, - all_span_indices=span_indicess, - all_sub_to_words=sub_to_wordss, - all_ev_preds=ev_preds, - params=params, - gold_dir=eval_data, - result_dir=result_dir) + write_ev_2file_bio(fids=fidss, + all_ent_preds=ent_preds, + all_words=wordss, + all_offsets=offsetss, + all_span_terms=all_ner_terms, + all_span_indices=span_indicess, + all_sub_to_words=sub_to_wordss, + all_ev_preds=ev_preds, + g_entity_ids_=g_entity_ids_, + params=params, + result_dir=result_dir) From 3c2eca08d41cf3348f156516ae65ba22d8b77bfb Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 02:09:30 +0900 Subject: [PATCH 60/70] fix bug --- eval/evalRE.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval/evalRE.py b/eval/evalRE.py index cc2b3a3..85b1db5 100644 --- a/eval/evalRE.py +++ b/eval/evalRE.py @@ -1,4 +1,5 @@ import os +import 
collections from collections import defaultdict import numpy as np From 653de588fc64bab27d2c3d25975292d34241bc57 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 02:17:04 +0900 Subject: [PATCH 61/70] write output --- eval/evalEV.py | 27 +++++++++++++++++++++++++++ eval/evaluate.py | 4 ++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/eval/evalEV.py b/eval/evalEV.py index 7cea518..bd31e05 100644 --- a/eval/evalEV.py +++ b/eval/evalEV.py @@ -502,6 +502,33 @@ def extract_fscore(path): 'tot_scores': (float(tot_precision.strip()), float(tot_recall.strip()), float(tot_fscore.strip()))} # write events to file + +# generate event output and evaluation +def write_events_bio(fids, all_ent_preds, all_words, all_offsets, all_span_terms, all_span_indices, all_sub_to_words, + all_ev_preds, g_entity_ids_, params, result_dir): + # generate predicted entities + pred_ents = generate_entities(fids=fids, + all_e_preds=all_ent_preds, + all_words=all_words, + all_offsets=all_offsets, + all_span_terms=all_span_terms, + all_span_indices=all_span_indices, + all_sub_to_words=all_sub_to_words, + params=params) + + # generate predicted events + pred_evs = generate_events(fids=fids, + all_ev_preds=all_ev_preds, + params=params) + + # generate event output + preds_output = generate_ev_output(pred_ents, pred_evs, params) + + # write output to file + write_ev_2file_bio(preds_output, pred_ents, result_dir, g_entity_ids_, params) + + return + def write_ev_2file_bio(pred_output, pred_ents, result_dir, g_entity_ids_, params): a2dir = result_dir + 'ev-last/ev-tok-a2/' anndir = result_dir + 'ev-last/ev-tok-ann/' diff --git a/eval/evaluate.py b/eval/evaluate.py index fda46e5..5c4022c 100644 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -3,7 +3,7 @@ import torch from tqdm import tqdm -from eval.evalEV import evaluate_ev, write_ev_2file_bio +from eval.evalEV import evaluate_ev, write_events_bio from eval.evalRE import estimate_perf, estimate_rel, write_entity_relations from eval.evalNER import eval_nner from scripts.pipeline_process import gen_ner_ann_files, gen_rel_ann_files @@ -658,7 +658,7 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa ) if is_eval_ev > 0: - write_ev_2file_bio(fids=fidss, + write_events_bio(fids=fidss, all_ent_preds=ent_preds, all_words=wordss, all_offsets=offsetss, From 6ba0689504f0fdbeb471b6769b0afb43b4e77350 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 02:40:13 +0900 Subject: [PATCH 62/70] fix bug --- model/EVNet.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/model/EVNet.py b/model/EVNet.py index 5ba06d2..21c6b4d 100644 --- a/model/EVNet.py +++ b/model/EVNet.py @@ -698,6 +698,7 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch): # nested loss nest_ev_loss = 0 + empty_pred = True # loop until stop nested event prediction or no more events predicted, or in limited nested levels while enable_nested_ev and len(current_positive_ids) > 0 and current_nested_level < self.params['max_ev_level']: @@ -726,6 +727,8 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch): current_tr_ids = ev_nest_cand_ids4nn['trids_'] current_truth_ids = ev_nest_cand_ids4nn['truth_ids_'] + empty_pred = False + # check non-empty if len(ev_nest_cand_ids4nn['trids_']) > 0: @@ -778,7 +781,7 @@ def calculate(self, ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch): if enable_modality: ev_loss = ev_loss + mod_losses * self.params['modality_weight'] - return 
+        return pred_ev_output, ev_loss, empty_pred
 
     def forward(self, ner_preds, rel_preds, n_epoch):
         """Forward.
@@ -826,7 +829,9 @@
         # 3-embeds, prediction, and loss
         # check empty
         if len(ev_ids4nn['ev_cand_ids4nn']['trids_']) > 0:
-            ev_out, ev_loss = self.calculate(ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch)
+            ev_out, ev_loss, empty_pred = self.calculate(ent_embeds, rel_embeds, rpred_types, ev_ids4nn, n_epoch)
+            if empty_pred:
+                ev_out = None
 
             return {'output': ev_out, 'loss': ev_loss}
         else:

From 1150ab28ff6b088f732daad026bc8506b35290eb Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 02:53:16 +0900
Subject: [PATCH 63/70] fix bug

---
 eval/evaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval/evaluate.py b/eval/evaluate.py
index 5c4022c..47aa182 100644
--- a/eval/evaluate.py
+++ b/eval/evaluate.py
@@ -630,7 +630,7 @@ def predict_bio(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, pa
             ent_preds.append(ner_out["nner_preds"])
 
             # add predicted events
-            ev_preds.append(ev_out)
+            ev_preds.append(ev_out['output'])
 
             span_indicess.append(
                 [

From 46a0f93ec7633bc1b782b8b323a5231a6777651f Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 02:55:00 +0900
Subject: [PATCH 64/70] event prediction

---
 eval/evalEV.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/eval/evalEV.py b/eval/evalEV.py
index bd31e05..b1f77b8 100644
--- a/eval/evalEV.py
+++ b/eval/evalEV.py
@@ -529,6 +529,56 @@ def write_events_bio(fids, all_ent_preds, all_words, all_offsets, all_span_terms
 
     return
 
+def mapping_entity_id(en_preds_, g_entity_ids_, rev_type_map, params):
+    # if using gold entities, start trigger ids from the max entity id + 1
+    if not params['ner_predict_all'] and len(g_entity_ids_) > 0:
+        eid = g_entity_ids_[-1] + 1
+
+    # predict both entity and trigger
+    else:
+        eid = 1
+
+    # mapping
+    enid_mapping = collections.OrderedDict()
+    en_preds_out_ = []
+
+    # entity in a2
+    a2_ents_ = []
+
+    # create mapping for entity id first
+    for pr_id, en_pred in en_preds_.items():
+
+        # id
+        en_id = en_pred[0]
+
+        if en_id.startswith('TR'):
+            continue
+
+        elif en_id.startswith('T'):
+            enid_mapping[en_id] = 'T' + str(eid)
+            eid += 1
+            en_preds_out_.append(en_pred)
+
+            # using gold entity but in a2
+            if not params['ner_predict_all']:
+                etype = rev_type_map[en_pred[1]]
+
+                # check entity type in a2
+                if etype in params['a2_entities']:
+                    a2_ents_.append(en_id)
+
+    # create mapping for trigger id
+    for pr_id, en_pred in en_preds_.items():
+        # id
+        en_id = en_pred[0]
+
+        if en_id.startswith('TR'):
+            enid_mapping[en_id] = 'T' + str(eid)
+            eid += 1
+            en_preds_out_.append(en_pred)
+
+    return enid_mapping, en_preds_out_, a2_ents_
+
 def write_ev_2file_bio(pred_output, pred_ents, result_dir, g_entity_ids_, params):
     a2dir = result_dir + 'ev-last/ev-tok-a2/'
     anndir = result_dir + 'ev-last/ev-tok-ann/'
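The `mapping_entity_id` helper added in [PATCH 64/70] renumbers predicted mentions into a single contiguous brat `T` id space, entities first and triggers second, so trigger ids cannot collide with entity ids in the written `.a2` files. Below is a minimal sketch of that two-pass renumbering under simplified assumptions; the real `en_preds_` entries carry more fields than the `[id, type]` pairs used here:

```python
# Sketch of the two-pass renumbering: entities ("T*") get fresh contiguous
# ids first, then triggers ("TR*"), mirroring the two loops in the patch.
from collections import OrderedDict

en_preds_ = OrderedDict([
    ("p1", ["T2", "Protein"]),   # predicted entity with a gappy raw id
    ("p2", ["TR1", "Binding"]),  # predicted trigger
    ("p3", ["T5", "Protein"]),   # another predicted entity
])

enid_mapping, eid = OrderedDict(), 1
for want_trigger in (False, True):       # pass 1: entities, pass 2: triggers
    for en_id, _etype in en_preds_.values():
        if en_id.startswith("TR") == want_trigger:
            enid_mapping[en_id] = "T" + str(eid)
            eid += 1

print(enid_mapping)  # OrderedDict([('T2', 'T1'), ('T5', 'T2'), ('TR1', 'T3')])
```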
From f0c2d082015e4df0ba33e56ff12e1d22c707b139 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 02:59:03 +0900
Subject: [PATCH 65/70] data path

---
 run.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/run.sh b/run.sh
index d6e57a8..f6d4e1a 100644
--- a/run.sh
+++ b/run.sh
@@ -38,8 +38,8 @@ elif [ "$TASK" = "offset" ]; then
 
     # paths
     REFDIR="data/corpora/$CORPUS_NAME/$DEV_TEST/"   # reference gold data
-    PREDDIR="$TASK_DIR/predict-$GOLD_E2E-$DEV_TEST/ev-last/ev-tok-a2/"
-    OUTDIR="$TASK_DIR/predict-$GOLD_E2E-$DEV_TEST/ev-last/"    # retrieve the original offsets
+    PREDDIR="$TASK_DIR/deepem-bionlp/predict-$GOLD_E2E-$DEV_TEST/ev-last/ev-tok-a2/"
+    OUTDIR="$TASK_DIR/deepem-bionlp/predict-$GOLD_E2E-$DEV_TEST/ev-last/"    # retrieve the original offsets
 
     # retrieve the original offsets and create zip format for online evaluation
     python scripts/postprocess.py $REFDIR $PREDDIR $OUTDIR $CORPUS_NAME $DEV_TEST
@@ -55,7 +55,7 @@ elif [ "$TASK" = "eval" ]; then
 
     # paths
     REFDIR="data/original_corpora/$CORPUS_NAME/$DEV_TEST/"  # reference gold data
-    PREDDIR="$TASK_DIR/predict-$GOLD_E2E-$DEV_TEST/ev-last/ev-orig-a2/"
+    PREDDIR="$TASK_DIR/deepem-bionlp/predict-$GOLD_E2E-$DEV_TEST/ev-last/ev-orig-a2/"

From 94abab331ba7245df88f19a05e746db66d6d66b8 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 03:10:25 +0900
Subject: [PATCH 66/70] install pubmed requirements

---
 setup/requirements.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup/requirements.txt b/setup/requirements.txt
index 9fc92d4..ddeef4b 100644
--- a/setup/requirements.txt
+++ b/setup/requirements.txt
@@ -12,4 +12,8 @@ pytorch-nlp
 tqdm
 requests
 torch==1.1.0
-torchvision==0.3.0
\ No newline at end of file
+torchvision==0.3.0
+git+https://github.com/titipata/pubmed_parser.git
+ebooklib
+beautifulsoup4
+lxml
\ No newline at end of file

From c468db2127a4b244a9a8116ec02c2775547b891d Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 03:14:34 +0900
Subject: [PATCH 67/70] pubmed configs

---
 scripts/generate_configs_bio.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/generate_configs_bio.py b/scripts/generate_configs_bio.py
index b0291e4..8730797 100644
--- a/scripts/generate_configs_bio.py
+++ b/scripts/generate_configs_bio.py
@@ -49,7 +49,7 @@ def gen_predict_config(predict_config, specific_config, eval_set, config_dir, mo
 
 def gen_predict_config_pubmed(predict_config, specific_config, config_dir, expdir, dataname):
     predict_config['test_data'] = ''.join(["data/", dataname, "/processed-text/", "text/"])
-    predict_config['result_dir'] = ''.join([expdir, dataname, '/deepem-bionlp', '/results/'])
+    predict_config['result_dir'] = ''.join([expdir, dataname, '/results/'])
 
     predict_config['raw_text'] = True
     predict_config['ner_predict_all'] = True
@@ -128,8 +128,7 @@ def generate_configs_pubmed(expdir, dataname, model_name, gpu):
     """Generate configs for all."""
 
     # create experiment dir
-    config_dir = os.path.join(os.path.join(expdir, dataname), 'deepem-bionlp')
-    config_dir = os.path.join(config_dir, 'configs')
+    config_dir = os.path.join(os.path.join(expdir, dataname), 'configs')
     utils.makedir(config_dir)
 
     # default setting

From 2a4c359f47bf92e681e9b179f705f71d48dd30c9 Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 03:18:13 +0900
Subject: [PATCH 68/70] predict on pubmed for raw-text

---
 loader/prepData/brat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loader/prepData/brat.py b/loader/prepData/brat.py
index b2ba019..eace03a 100644
--- a/loader/prepData/brat.py
+++ b/loader/prepData/brat.py
@@ -207,7 +207,7 @@ def brat_loader(files_fold, params):
         fevents['ids'] = idsE
 
         # check empty
-        if len(idsT) == len(idsTR) == 0:
+        if len(idsT) == 0 and not params['raw_text']:
             continue
 
         else:

From bdd0c9b3653fe5465260357925baf67aa4392e7d Mon Sep 17 00:00:00 2001
From: trieuhl
Date: Tue, 22 Mar 2022 03:19:10 +0900
Subject: [PATCH 69/70] raw text config

---
 configs/default.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index 8e02934..bdada65 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -120,4 +120,5 @@ gold_eval: False # display options stats: True # print relations -show_macro: False # print result for relations \ No newline at end of file +show_macro: False # print result for relations +raw_text: False \ No newline at end of file From be6f6136cc461d23f43a2bd3ef5779bb23a85116 Mon Sep 17 00:00:00 2001 From: trieuhl Date: Tue, 22 Mar 2022 03:21:53 +0900 Subject: [PATCH 70/70] brat path --- pubmed.sh | 2 +- run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pubmed.sh b/pubmed.sh index 76e3549..dc7c5a3 100644 --- a/pubmed.sh +++ b/pubmed.sh @@ -175,7 +175,7 @@ elif [ "$TASK" = "e2e" ]; then echo "6. Prepare data for brat" PRED_DIR="experiments/$MY_DATA/results/ev-last/$MY_DATA-brat/" - BRAT_DIR="brat/brat-v1.3_Crunchy_Frog/data/" + BRAT_DIR="brat/data/" # annotation file CONFIG="configs/brat/$MODEL_NAME" diff --git a/run.sh b/run.sh index f6d4e1a..3af290d 100644 --- a/run.sh +++ b/run.sh @@ -70,7 +70,7 @@ elif [ "$TASK" = "brat" ]; then DEV_TEST=$4 # predict for dev, test sets PRED_DIR="$TASK_DIR/predict-$GOLD_E2E-$DEV_TEST/ev-last/$CORPUS_NAME-brat/" - BRAT_DIR="brat/brat-v1.3_Crunchy_Frog/data/" + BRAT_DIR="brat/data/" # annotation file CONFIG="configs/brat/$CORPUS_NAME"
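With [PATCH 70/70], both `pubmed.sh` and `run.sh` point brat at `brat/data/`, which matches the clone location created by `setup/install-brat.sh` rather than the older `brat-v1.3_Crunchy_Frog` release directory. As a quick sanity check before opening the predictions in brat, a sketch like the following can flag unpaired files; it assumes the brat clone sits at the repository root, and relies only on brat's standard convention of one `.txt` source per `.ann` annotation file:

```python
# Sketch: brat expects one .txt text file per .ann annotation file; list any
# predicted documents whose text file was not copied into brat/data/.
import os

brat_dir = "brat/data"

def stems(ext):
    # document ids for all files in brat_dir with the given extension
    return {f[: -len(ext)] for f in os.listdir(brat_dir) if f.endswith(ext)}

missing = sorted(stems(".ann") - stems(".txt"))
print(len(stems(".ann")), "annotated docs; missing .txt for:", missing[:5])
```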