From af0da5e85aad528944c93d0b5083b084d430f4d4 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 15 Dec 2022 18:03:16 +0000 Subject: [PATCH 01/61] efficiency test without server --- scripts/evaluate_predictions.py | 138 ++++++++++++++++++++++++++++++++ src/REL/ner/bert_wrapper.py | 11 +++ 2 files changed, 149 insertions(+) create mode 100644 scripts/evaluate_predictions.py create mode 100644 src/REL/ner/bert_wrapper.py diff --git a/scripts/evaluate_predictions.py b/scripts/evaluate_predictions.py new file mode 100644 index 0000000..8e460a6 --- /dev/null +++ b/scripts/evaluate_predictions.py @@ -0,0 +1,138 @@ +import re + + +UNUSED = -1 + + +def get_gold_data(doc): + GOLD_DATA_FILE = "./data/generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv" + entities = [] + + in_file = open(GOLD_DATA_FILE, "r") + for line in in_file: + if re.search(f"^-DOCSTART- \({doc} ", line): + break + for line in in_file: + if re.search(f"^-DOCSTART- ", line): + break + fields = line.strip().split("\t") + if len(fields) > 3: + if fields[1] == "B": + entities.append([fields[2], fields[3]]) + return entities + + +def md_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i): + return gold_entities[gold_i][0].lower() == predicted_entities[predicted_i][0].lower() + + +def el_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i): + return(gold_entities[gold_i][0].lower() == predicted_entities[predicted_i][0].lower() and + gold_entities[gold_i][1].lower() == predicted_entities[predicted_i][1].lower()) + + +def find_correct_els(gold_entities, predicted_entities, gold_links, predicted_links): + for gold_i in range(0, len(gold_entities)): + if gold_links[gold_i] == UNUSED: + for predicted_i in range(0, len(predicted_entities)): + if (predicted_links[predicted_i] == UNUSED and + el_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i)): + gold_links[gold_i] = predicted_i + predicted_links[predicted_i] = gold_i + return gold_links, predicted_links + + +def find_correct_mds(gold_entities, predicted_entities, gold_links, predicted_links): + for gold_i in range(0, len(gold_entities)): + if gold_links[gold_i] == UNUSED: + for predicted_i in range(0, len(predicted_entities)): + if (predicted_links[predicted_i] == UNUSED and + md_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i)): + gold_links[gold_i] = predicted_i + predicted_links[predicted_i] = gold_i + return gold_links, predicted_links + + + +def compare_entities(gold_entities, predicted_entities): + gold_links = len(gold_entities) * [UNUSED] + predicted_links = len(predicted_entities) * [UNUSED] + gold_links, predicted_links = find_correct_els(gold_entities, predicted_entities, gold_links, predicted_links) + gold_links, predicted_links = find_correct_mds(gold_entities, predicted_entities, gold_links, predicted_links) + return gold_links, predicted_links + + +def count_entities(gold_entities, predicted_entities, gold_links, predicted_links): + correct = 0 + wrong_md = 0 + wrong_el = 0 + missed = 0 + for predicted_i in range(0, len(predicted_links)): + if predicted_links[predicted_i] == UNUSED: + wrong_md += 1 + elif predicted_entities[predicted_i][1] == gold_entities[predicted_links[predicted_i]][1]: + correct += 1 + else: + wrong_el += 1 + for gold_i in range(0, len(gold_links)): + if gold_links[gold_i] == UNUSED: + missed += 1 + return correct, wrong_md, wrong_el, missed + + +def compare_and_count_entities(gold_entities, predicted_entities): + gold_links, 
predicted_links = compare_entities(gold_entities, predicted_entities) + return count_entities(gold_entities, predicted_entities, gold_links, predicted_links) + + +def compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): + if correct_all + wrong_el_all > 0: + precision_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + wrong_md_all) + recall_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + missed_all) + f1_md = 2 * precision_md * recall_md / ( precision_md + recall_md ) + else: + precision_md = 0 + recall_md = 0 + f1_md = 0 + return precision_md, recall_md, f1_md + + +def compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): + if correct_all > 0: + precision_el = 100 * correct_all / (correct_all + wrong_md_all + wrong_el_all) + recall_el = 100 * correct_all / (correct_all + wrong_el_all + missed_all) + f1_el = 2 * precision_el * recall_el / ( precision_el + recall_el ) + else: + precision_el = 0.0 + recall_el = 0 + f1_el = 0 + return precision_el, recall_el, f1_el + + +def print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): + precision_md, recall_md, f1_md = compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) + precision_el, recall_el, f1_el = compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) + print("Results: PMD RMD FMD PEL REL FEL: ", end="") + print(f"{precision_md:0.1f}% {recall_md:0.1f}% {f1_md:0.1f}% | ",end="") + print(f"{precision_el:0.1f}% {recall_el:0.1f}% {f1_el:0.1f}%") + return precision_md, recall_md, f1_md, precision_el, recall_el, f1_el + + +def evaluate(predictions): + correct_all = 0 + wrong_md_all = 0 + wrong_el_all = 0 + missed_all = 0 + for doc in predictions: + gold_entities = get_gold_data(doc) + predicted_entities = [] + for mention in predictions[doc]: + predicted_entities.append([mention["mention"], mention["prediction"]]) + #print("GOLD", gold_entities) + #print("PREDICTED", predicted_entities) + correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) + correct_all += correct + wrong_md_all += wrong_md + wrong_el_all += wrong_el + missed_all += missed + print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) diff --git a/src/REL/ner/bert_wrapper.py b/src/REL/ner/bert_wrapper.py new file mode 100644 index 0000000..ea97844 --- /dev/null +++ b/src/REL/ner/bert_wrapper.py @@ -0,0 +1,11 @@ +from transformers import AutoTokenizer, AutoModelForTokenClassification +from transformers import pipeline + +def load_bert_ner(path_or_url): + try: + tokenizer = AutoTokenizer.from_pretrained(path_or_url) + model = AutoModelForTokenClassification.from_pretrained(path_or_url) + return pipeline("ner", model=model, tokenizer=tokenizer) + except Exception: + pass + return From 26072db18ab8a498ea7e5156fd11c42531308718 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 15 Dec 2022 18:03:43 +0000 Subject: [PATCH 02/61] efficiency test without server --- scripts/efficiency_test.py | 126 +++++++++++++++++++++++++------ src/REL/db/base.py | 3 +- src/REL/mention_detection.py | 141 ++++++++++++++++++++++++++++++----- src/REL/server.py | 66 ++++++++++++++-- 4 files changed, 291 insertions(+), 45 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index fc66f54..0f8dc5e 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -1,34 +1,88 @@ +import argparse +import evaluate_predictions +import json import numpy as np import requests from 
REL.training_datasets import TrainingEvaluationDatasets
 
+parser = argparse.ArgumentParser()
+parser.add_argument("--max_docs", help = "number of documents")
+parser.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true")
+parser.add_argument("--split_docs_value", help = "threshold number of tokens to split document")
+parser.add_argument("--use_bert_base_cased", help = "use Bert base cased rather than Flair", action="store_true")
+parser.add_argument("--use_bert_large_cased", help = "use Bert large cased rather than Flair", action="store_true")
+parser.add_argument("--use_bert_base_uncased", help = "use Bert base uncased rather than Flair", action="store_true")
+parser.add_argument("--use_bert_large_uncased", help = "use Bert large uncased rather than Flair", action="store_true")
+parser.add_argument("--use_server", help = "use server", action="store_true")
+parser.add_argument("--wiki_version", help = "Wiki version")
+args = parser.parse_args()
+
 np.random.seed(seed=42)
 
-base_url = "/Users/vanhulsm/Desktop/projects/data/"
-wiki_version = "wiki_2014"
+base_url = "/store/userdata/etjong/REL-erik/data/"
+if args.max_docs:
+    max_docs = int(args.max_docs)
+else:
+    max_docs = 50
 
+if args.process_sentences:
+    process_sentences = True
+else:
+    process_sentences = False
+
+if args.split_docs_value:
+    split_docs_value = int(args.split_docs_value)
+else:
+    split_docs_value = 0
+
+if args.wiki_version:
+    wiki_version = args.wiki_version
+else:
+    wiki_version = "wiki_2019"
+
 datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()["aida_testB"]
 
-# random_docs = np.random.choice(list(datasets.keys()), 50)
+if args.use_server:
+    use_server = True
+else:
+    use_server = False
+
+use_bert_base_cased = False
+use_bert_large_cased = False
+use_bert_base_uncased = False
+use_bert_large_uncased = False
+
+if args.use_bert_base_cased:
+    use_bert_base_cased = True
+elif args.use_bert_large_cased:
+    use_bert_large_cased = True
+elif args.use_bert_base_uncased:
+    use_bert_base_uncased = True
+elif args.use_bert_large_uncased:
+    use_bert_large_uncased = True
+
+print(f"max_docs={max_docs} wiki_version={wiki_version} use_bert_base_cased={use_bert_base_cased} use_bert_large_cased={use_bert_large_cased} use_bert_base_uncased={use_bert_base_uncased} use_bert_large_uncased={use_bert_large_uncased} use_server={use_server} process_sentences={process_sentences} split_docs_value={split_docs_value}")
 
-server = True
 docs = {}
+all_results = {}
 for i, doc in enumerate(datasets):
     sentences = []
     for x in datasets[doc]:
         if x["sentence"] not in sentences:
             sentences.append(x["sentence"])
-    text = ". ".join([x for x in sentences])
+    if len(sentences) == 0:
+        continue
 
-    if len(docs) == 50:
-        print("length docs is 50.")
+    text = ". ".join([x for x in sentences])
+    if len(docs) >= max_docs:
+        print(f"length docs is {len(docs)}.")
         print("====================")
         break
 
     if len(text.split()) > 200:
         docs[doc] = [text, []]
 
         # Demo script that can be used to query the API.
- if server: + if use_server: myjson = { "text": text, "spans": [ @@ -40,13 +94,26 @@ print(myjson) print("Output API:") - print(requests.post("http://192.168.178.11:1235", json=myjson).json()) + results = requests.post("http://0.0.0.0:5555", json=myjson) print("----------------------------") - + print("results", type(results), results) + print("results.json()", results.json()) + try: + results_list = [] + print("eff_test", results.json()) + for result in results.json(): + results_list.append({ "mention": result[2], "prediction": result[3] }) # Flair + Bert + all_results[doc] = results_list + except json.decoder.JSONDecodeError: + print("The analysis results are not in json format:", str(results)) + all_results[doc] = [] + +if len(all_results) > 0: + evaluate_predictions.evaluate(all_results) # --------------------- Now total -------------------------------- # ------------- RUN SEPARATELY TO BALANCE LOAD-------------------- -if not server: +if not use_server: from time import time import flair @@ -56,27 +123,44 @@ from REL.entity_disambiguation import EntityDisambiguation from REL.mention_detection import MentionDetection - base_url = "C:/Users/mickv/desktop/data_back/" + from REL.ner.bert_wrapper import load_bert_ner - flair.device = torch.device("cuda:0") + flair.device = torch.device("cpu") mention_detection = MentionDetection(base_url, wiki_version) # Alternatively use Flair NER tagger. - tagger_ner = SequenceTagger.load("ner-fast") + if use_bert_base_uncased: + tagger_ner = load_bert_ner("dslim/bert-base-NER-uncased") + elif use_bert_large_uncased: + tagger_ner = load_bert_ner("Jorgeutd/bert-large-uncased-finetuned-ner") + elif use_bert_base_cased: + tagger_ner = load_bert_ner("dslim/bert-base-NER") + elif use_bert_large_cased: + tagger_ner = load_bert_ner("dslim/bert-large-NER") + else: + tagger_ner = SequenceTagger.load("ner-fast") start = time() - mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) - print("MD took: {}".format(time() - start)) - - # 3. Load model. +# mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) + mentions_dataset, n_mentions = mention_detection.find_mentions( + docs, + (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased), + process_sentences, + split_docs_value, + tagger_ner) + print("MD took: {} seconds".format(round(time() - start, 2))) + + # 3. Load ED model. config = { "mode": "eval", "model_path": "{}/{}/generated/model".format(base_url, wiki_version), } - model = EntityDisambiguation(base_url, wiki_version, config) + ed_model = EntityDisambiguation(base_url, wiki_version, config) # 4. Entity disambiguation. 
start = time() - predictions, timing = model.predict(mentions_dataset) - print("ED took: {}".format(time() - start)) + predictions, timing = ed_model.predict(mentions_dataset) + print("ED took: {} seconds".format(round(time() - start, 2))) + + evaluate_predictions.evaluate(predictions) diff --git a/src/REL/db/base.py b/src/REL/db/base.py index 8eec44d..01e3d81 100644 --- a/src/REL/db/base.py +++ b/src/REL/db/base.py @@ -185,7 +185,8 @@ def lookup_wik(self, w, table_name, column): {"word": w}, ).fetchone() res = ( - e if e is None else json.loads(e[0].decode()) if column == "p_e_m" else e[0] + #e if e is None else json.loads(e[0].decode()) if column == "p_e_m" else e[0] + e if e is None else json.loads("".join(chr(int(x, 2)) for x in e[0].split())) if column == "p_e_m" else e[0] ) return res diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index fc552ed..b9dcd91 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -1,6 +1,9 @@ +import re +import sys +from termcolor import colored from flair.data import Sentence from flair.models import SequenceTagger -from segtok.segmenter import split_single +from syntok import segmenter from REL.mention_detection_base import MentionDetectionBase @@ -62,7 +65,18 @@ def format_spans(self, dataset): results[doc] = results_doc return results, total_ment - def split_text(self, dataset, is_flair=False): + + def split_single(self, text): + sentences_as_token_lists = segmenter.analyze(text) + sentences = [] + for paragraph in sentences_as_token_lists: + for sentence in paragraph: + tokens = [ str(token) for token in sentence ] + sentences.append("".join(tokens)) + return sentences + + + def split_text(self, dataset, process_sentences, split_docs_value=0, tagger=None, is_flair=False): """ Splits text into sentences with optional spans (format is a requirement for GERBIL usage). 
This behavior is required for the default NER-tagger, which during experiments was experienced @@ -76,7 +90,12 @@ def split_text(self, dataset, is_flair=False): processed_sentences = [] for doc in dataset: text, spans = dataset[doc] - sentences = split_single(text) + if process_sentences: + sentences = self.split_single(text) + elif split_docs_value > 0: + sentences = self.split_text_in_parts(text, split_docs_value, tagger) + else: + sentences = [ text ] res[doc] = {} i = 0 @@ -104,7 +123,82 @@ def split_text(self, dataset, is_flair=False): splits.append(splits[-1] + i) return res, processed_sentences, splits - def find_mentions(self, dataset, tagger=None): + + def combine_entities(self, ner_results): + ner_results_out = [] + i = 0 + while i < len(ner_results)-1: + last_end = ner_results[i]["end"] + ner_results_out.append(dict(ner_results[i])) + j = 1 + while i + j < len(ner_results) and (ner_results[i+j]["start"] == last_end or + (ner_results[i+j]["start"] == last_end + 1 and + re.search("^I", ner_results[i+j]["entity"]) and + re.sub("^..", "", ner_results[i+j]["entity"]) == re.sub("^..", "", ner_results[i]["entity"]))): + if ner_results[i+j]["start"] == last_end: + ner_results_out[-1]["word"] += re.sub("^##", "", ner_results[i+j]["word"]) + else: + ner_results_out[-1]["word"] += " " + ner_results[i+j]["word"] + ner_results_out[-1]["end"] = ner_results[i+j]["end"] + last_end = ner_results[i+j]["end"] + j += 1 + i += j + return ner_results_out + + + def split_sentence_in_bert_tokens(self, sentence, tagger): + tokenizer_results = tagger.tokenizer([sentence], return_offsets_mapping=True) # warns if sentence is too long (>512) + input_ids = tokenizer_results["input_ids"][0] + token_spans = tokenizer_results["offset_mapping"][0] + tokens = [ tagger.tokenizer.decode(token_id) for token_id in input_ids ] + return tokens, token_spans + + + + def combine_tokens_to_text(self, token_list): + text = "" + for token in token_list: + if re.search("^##", token): + text += re.sub("^##", "", token) + elif text == "": + text = token + else: + text += " " + token + return text + + + def split_text_in_parts(self, text, split_docs_value, tagger): + """ + Splits text in parts of as most split_docs_value tokens. Texts are split at sentence + boundaries. If a sentence is longer than the limit it will be split in parts of + maximally split_docs_value tokens. + """ + sentences = self.split_single(text) + token_lists = [] + texts = [] + for sentence in sentences: + sentence_tokens, token_spans = self.split_sentence_in_bert_tokens(sentence, tagger) + if len(token_lists) == 0 or (len(token_lists[-1]) + len(sentence_tokens)) > split_docs_value: + token_lists.append([]) + texts.append("") + token_lists[-1].extend(sentence_tokens) + if texts[-1] == "": + texts[-1] = sentence + else: + texts[-1] += sentence + first_split_point = 0 + while len(token_lists[-1]) > split_docs_value: + token_lists.append(list(token_lists[-1])) + token_lists[-2] = token_lists[-2][:split_docs_value] + token_lists[-1] = token_lists[-1][split_docs_value:] + second_split_point = token_spans[-len(token_lists[-1])][0] + texts[-1] = sentence[first_split_point:second_split_point] + texts.append(sentence[second_split_point:]) + first_split_point = second_split_point + return texts + + + def find_mentions(self, dataset, use_bert, process_sentences, split_docs_value, tagger=None): """ Responsible for finding mentions given a set of documents in a batch-wise manner. More specifically, it returns the mention, its left/right context and a set of candidates. 
@@ -117,15 +211,15 @@ def find_mentions(self, dataset, tagger=None): # Verify if Flair, else ngram or custom. is_flair = isinstance(tagger, SequenceTagger) dataset_sentences_raw, processed_sentences, splits = self.split_text( - dataset, is_flair + dataset, process_sentences, split_docs_value, tagger, is_flair ) results = {} total_ment = 0 if is_flair: tagger.predict(processed_sentences) for i, doc in enumerate(dataset_sentences_raw): - contents = dataset_sentences_raw[doc] raw_text = dataset[doc][0] + contents = dataset_sentences_raw[doc] sentences_doc = [v[0] for v in contents.values()] sentences = processed_sentences[splits[i] : splits[i + 1]] result_doc = [] @@ -136,21 +230,34 @@ def find_mentions(self, dataset, tagger=None): ): # Only include offset if using Flair. - if is_flair: - offset = raw_text.find(sentence, cum_sent_length) - + # if is_flair: + # 20220607: no always include + offset = raw_text.find(sentence, cum_sent_length) + if offset < 0: + print(colored(f"sentence not found in text: cannot happen: {sentence}", "red"), file=sys.stderr) + offset = 0 + entity_counter = 0 for entity in ( snt.get_spans("ner") if is_flair - else tagger.predict(snt, processed_sentences) + else self.combine_entities(tagger(snt)) ): - text, start_pos, end_pos, conf, tag = ( - entity.text, - entity.start_position, - entity.end_position, - entity.score, - entity.tag, - ) + if use_bert: + text, start_pos, end_pos, conf, tag = ( + sentence[entity["start"]:entity["end"]], # for BERT + entity["start"], + entity["end"], + entity["score"], + entity["entity"], + ) + else: + text, start_pos, end_pos, conf, tag = ( + entity.text, # for Flair + entity.start_position, + entity.end_position, + entity.score, + entity.tag, + ) total_ment += 1 m = self.preprocess_mention(text) cands = self.get_candidates(m) diff --git a/src/REL/server.py b/src/REL/server.py index d26d6a9..8af18a5 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -10,14 +10,17 @@ -def make_handler(base_url, wiki_version, model, tagger_ner): +def make_handler(base_url, wiki_version, ed_model, tagger_ner, use_bert, process_sentences, split_docs_value=0): """ Class/function combination that is used to setup an API that can be used for e.g. GERBIL evaluation. """ class GetHandler(BaseHTTPRequestHandler): def __init__(self, *args, **kwargs): - self.model = model + self.ed_model = ed_model self.tagger_ner = tagger_ner + self.use_bert = use_bert + self.process_sentences = process_sentences + self.split_docs_value = split_docs_value self.base_url = base_url self.wiki_version = wiki_version @@ -98,6 +101,17 @@ def read_json(self, post_data): return text, spans + def convert_bert_result(self, result): + new_result = {} + for doc_key in result: + new_result[doc_key] = [] + for mention_data in result[doc_key]: + new_result[doc_key].append(list(mention_data)) + new_result[doc_key][-1][2], new_result[doc_key][-1][3] =\ + new_result[doc_key][-1][3], new_result[doc_key][-1][2] + new_result[doc_key][-1] = tuple(new_result[doc_key][-1]) + return new_result + def generate_response(self, text, spans): """ Generates response for API. Can be either ED only or EL, meaning end-to-end. 
@@ -118,11 +132,11 @@ def generate_response(self, text, spans): # EL processed = {API_DOC: [text, spans]} mentions_dataset, total_ment = self.mention_detection.find_mentions( - processed, self.tagger_ner + processed, self.use_bert, self.process_sentences, self.split_docs_value, self.tagger_ner ) # Disambiguation - predictions, timing = self.model.predict(mentions_dataset) + predictions, timing = self.ed_model.predict(mentions_dataset) # Process result. result = process_results( @@ -131,6 +145,7 @@ def generate_response(self, text, spans): processed, include_offset=False if ((len(spans) > 0) or self.custom_ner) else True, ) + # result = self.convert_bert_result(result) # Singular document. if len(result) > 0: @@ -147,6 +162,7 @@ def generate_response(self, text, spans): from REL.entity_disambiguation import EntityDisambiguation from REL.ner import load_flair_ner + from REL.ner.bert_wrapper import load_bert_ner p = argparse.ArgumentParser() p.add_argument("base_url") @@ -155,16 +171,54 @@ def generate_response(self, text, spans): p.add_argument("--ner-model", default="ner-fast") p.add_argument("--bind", "-b", metavar="ADDRESS", default="0.0.0.0") p.add_argument("--port", "-p", default=5555, type=int) + p.add_argument("--use_bert_large_cased", help = "use Bert large cased rather than Flair", action="store_true") + p.add_argument("--use_bert_base_cased", help = "use Bert base cased rather than Flair", action="store_true") + p.add_argument("--use_bert_large_uncased", help = "use Bert large uncased rather than Flair", action="store_true") + p.add_argument("--use_bert_base_uncased", help = "use Bert base uncased rather than Flair", action="store_true") + p.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true") + p.add_argument("--split_docs_value", help = "threshold number of tokens to split document") + args = p.parse_args() - ner_model = load_flair_ner(args.ner_model) + use_bert_base_cased = False + use_bert_large_cased = False + use_bert_base_uncased = False + use_bert_large_uncased = False + + if args.use_bert_base_cased: + ner_model = load_bert_ner("dslim/bert-base-NER") + use_bert_base_cased = True + elif args.use_bert_large_cased: + ner_model = load_bert_ner("dslim/bert-large-NER") + use_bert_large_cased = True + elif args.use_bert_base_uncased: + ner_model = load_bert_ner("dslim/bert-base-NER-uncased") + use_bert_base_uncased = True + elif args.use_bert_large_uncased: + ner_model = load_bert_ner("Jorgeutd/bert-large-uncased-finetuned-ner") + use_bert_large_uncased = True + else: + ner_model = load_flair_ner(args.ner_model) + + split_docs_value = 0 + if args.split_docs_value: + split_docs_value = int(args.split_docs_value) + + process_sentences = args.process_sentences + ed_model = EntityDisambiguation( args.base_url, args.wiki_version, {"mode": "eval", "model_path": args.ed_model} ) server_address = (args.bind, args.port) server = HTTPServer( server_address, - make_handler(args.base_url, args.wiki_version, ed_model, ner_model), + make_handler(args.base_url, + args.wiki_version, + ed_model, + ner_model, + (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased), + process_sentences, + split_docs_value) ) try: From 5f850dca638e96e4e5449b5c3496155a94b7e474 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 15 Dec 2022 18:10:02 +0000 Subject: [PATCH 03/61] efficiency test without server --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 
376b3a6..b5fb565 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,17 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/radboud-el)](https://pypi.org/project/radboud-el/) [![PyPI](https://img.shields.io/pypi/v/radboud-el.svg?style=flat)](https://pypi.org/project/radboud-el/) +--- + +Example tests: + +* Flair: `python3 scripts/efficiency\_test.py --process\_sentences` +* Bert: `python3 scripts/efficiency\_test.py --use\_bert\_base\_cased --split\_docs\_value 500` + +Need installation of REL documents in directory `doc` (`ed-wiki-2019`, `generic` and `wiki_2019`) + +--- + REL is a modular Entity Linking package that is provided as a Python package as well as a web API. REL has various meanings - one might first notice that it stands for relation, which is a suiting name for the problems that can be tackled with this package. Additionally, in Dutch a 'rel' means a disturbance of the public order, which is exactly what we aim to achieve with the release of this package. REL utilizes *English* Wikipedia as a knowledge base and can be used for the following tasks: From ca1b9373adbd6d38481e1d2942c75069e858250c Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 15 Dec 2022 18:10:22 +0000 Subject: [PATCH 04/61] efficiency test without server --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b5fb565..26a816a 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ Example tests: Need installation of REL documents in directory `doc` (`ed-wiki-2019`, `generic` and `wiki_2019`) +Server does not work yet + --- REL is a modular Entity Linking package that is provided as a Python package as well as a web API. REL has various meanings - one might first notice that it stands for relation, which is a suiting name for the problems that can be tackled with this package. Additionally, in Dutch a 'rel' means a disturbance of the public order, which is exactly what we aim to achieve with the release of this package. 
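The `--use_server` path added to `scripts/efficiency_test.py` in PATCH 02 boils down to a plain HTTP POST against the running server. A minimal standalone sketch of that request follows; the input sentence is illustrative (not from the test set), and the port assumes the server default of 5555:

```python
import requests

# Query a locally running REL server (started via src/REL/server.py, default port 5555).
# An empty "spans" list requests end-to-end entity linking: the NER tagger finds mentions.
myjson = {
    "text": "Utrecht is a city in the Netherlands.",  # illustrative input
    "spans": [],
}
response = requests.post("http://0.0.0.0:5555", json=myjson)

# Each result tuple carries the mention at index 2 and the predicted entity at index 3,
# matching how efficiency_test.py builds its {"mention": ..., "prediction": ...} records.
for result in response.json():
    print(result[2], "->", result[3])
```
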
From 234e88606bf242776fe2a4e54fae04e1526554bb Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 15 Dec 2022 18:12:49 +0000 Subject: [PATCH 05/61] efficiency test without server --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 26a816a..f5a84b6 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,8 @@ Example tests: -* Flair: `python3 scripts/efficiency\_test.py --process\_sentences` -* Bert: `python3 scripts/efficiency\_test.py --use\_bert\_base\_cased --split\_docs\_value 500` +* Flair: `python3 scripts/efficiency_test.py --process_sentences` +* Bert: `python3 scripts/efficiency_test.py --use_bert_base_cased --split_docs_value 500` Need installation of REL documents in directory `doc` (`ed-wiki-2019`, `generic` and `wiki_2019`) From f8f3d7efae1e3b64f934f37bf672da62f1913419 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 16 Dec 2022 09:43:12 +0000 Subject: [PATCH 06/61] fixed bert server usage --- README.md | 9 +++++---- src/REL/server.py | 14 +++++++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f5a84b6..ea48748 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,12 @@ Example tests: * Flair: `python3 scripts/efficiency_test.py --process_sentences` -* Bert: `python3 scripts/efficiency_test.py --use_bert_base_cased --split_docs_value 500` +* Bert: `python3 scripts/efficiency_test.py --use_bert_base_uncased --split_docs_value 500` +* Server (slower): + * `python3 src/REL/server.py --use_bert_base_uncased --split_docs_value 500 --ed-model ed-wiki-2019 data wiki_2019` + * `python3 scripts/efficiency_test.py --use_server` -Need installation of REL documents in directory `doc` (`ed-wiki-2019`, `generic` and `wiki_2019`) - -Server does not work yet +Needs installation of REL documents in directory `doc` (`ed-wiki-2019`, `generic` and `wiki_2019`) --- diff --git a/src/REL/server.py b/src/REL/server.py index 8af18a5..476918b 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -1,4 +1,5 @@ import json +import numpy from http.server import BaseHTTPRequestHandler from flair.models import SequenceTagger @@ -55,6 +56,17 @@ def do_HEAD(self): self.wfile.write(bytes(json.dumps([]), "utf-8")) return + def solve_floats(self, data): + data_new = [] + for data_set in data: + data_set_new_list = [] + for data_el in data_set: + if isinstance(data_el, numpy.float32): + data_el = float(data_el) + data_set_new_list.append(data_el) + data_new.append(data_set_new_list) + return data_new + def do_POST(self): """ Returns response. 
@@ -70,7 +82,7 @@ def do_POST(self): text, spans = self.read_json(post_data) response = self.generate_response(text, spans) - self.wfile.write(bytes(json.dumps(response), "utf-8")) + self.wfile.write(bytes(json.dumps(self.solve_floats(response)), "utf-8")) except Exception as e: print(f"Encountered exception: {repr(e)}") self.send_response(400) From f3914e97cd271be3e092aa247dc321268d7f0d40 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 16 Dec 2022 10:36:06 +0000 Subject: [PATCH 07/61] fixed gerbil test problem --- scripts/gerbil_middleware/Makefile | 6 +++--- scripts/gerbil_middleware/pom.xml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/gerbil_middleware/Makefile b/scripts/gerbil_middleware/Makefile index 519a9f7..e899701 100644 --- a/scripts/gerbil_middleware/Makefile +++ b/scripts/gerbil_middleware/Makefile @@ -1,10 +1,10 @@ default: build dockerize build: - mvn clean package -U + mvn clean package -U dockerize: - docker build -t git.project-hobbit.eu:4567/gerbil/spotwrapnifws4test . + docker build -t git.project-hobbit.eu:4567/gerbil/spotwrapnifws4test . push: - docker push git.project-hobbit.eu:4567/gerbil/spotwrapnifws4test \ No newline at end of file + docker push git.project-hobbit.eu:4567/gerbil/spotwrapnifws4test diff --git a/scripts/gerbil_middleware/pom.xml b/scripts/gerbil_middleware/pom.xml index 97e8aa0..af5da3e 100644 --- a/scripts/gerbil_middleware/pom.xml +++ b/scripts/gerbil_middleware/pom.xml @@ -76,7 +76,7 @@ org.apache.jena jena-core - 4.2.0 + 2.11.1 org.apache.jena From 84f28d7cdfa0e4d354cd0b9de5cb7f0e76584a2e Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 20 Dec 2022 13:33:23 +0000 Subject: [PATCH 08/61] added multilingual bert --- scripts/efficiency_test.py | 10 ++++++++-- src/REL/server.py | 7 ++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 0f8dc5e..3486d7b 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -14,6 +14,7 @@ parser.add_argument("--use_bert_large_cased", help = "use Bert large cased rather than Flair", action="store_true") parser.add_argument("--use_bert_base_uncased", help = "use Bert base uncased rather than Flair", action="store_true") parser.add_argument("--use_bert_large_uncased", help = "use Bert large uncased rather than Flair", action="store_true") +parser.add_argument("--use_bert_multilingual", help = "use Bert multilingual rather than Flair", action="store_true") parser.add_argument("--use_server", help = "use server", action="store_true") parser.add_argument("--wiki_version", help = "Wiki version") args = parser.parse_args() @@ -51,6 +52,7 @@ use_bert_large_cased = False use_bert_base_uncased = False use_bert_large_uncased = False +use_bert_multilingual = False if args.use_bert_base_cased: use_bert_base_cased = True @@ -60,8 +62,10 @@ use_bert_base_uncased = True elif args.use_bert_large_uncased: use_bert_large_uncased = True +elif args.use_bert_multilingual: + use_bert_multilingual = True -print(f"max_docs={max_docs} wiki_version={wiki_version} use_bert_base_cased={use_bert_base_cased} use_bert_large_cased={use_bert_large_cased} use_bert_base_uncased={use_bert_base_uncased} use_bert_large_uncased={use_bert_large_uncased} use_server={use_server} process_sentences={process_sentences} split_docs_value={split_docs_value}") +print(f"max_docs={max_docs} wiki_version={wiki_version} use_bert_base_cased={use_bert_base_cased} use_bert_large_cased={use_bert_large_cased} 
use_bert_base_uncased={use_bert_base_uncased} use_bert_large_uncased={use_bert_large_uncased} use_bert_multilingual={use_bert_multilingual} use_server={use_server} process_sentences={process_sentences} split_docs_value={split_docs_value}") docs = {} all_results = {} @@ -138,6 +142,8 @@ tagger_ner = load_bert_ner("dslim/bert-base-NER") elif use_bert_large_cased: tagger_ner = load_bert_ner("dslim/bert-large-NER") + elif use_bert_multilingual: + tagger_ner = load_bert_ner("Davlan/bert-base-multilingual-cased-ner-hrl") else: tagger_ner = SequenceTagger.load("ner-fast") @@ -145,7 +151,7 @@ # mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) mentions_dataset, n_mentions = mention_detection.find_mentions( docs, - (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased), + (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased or use_bert_multilingual), process_sentences, split_docs_value, tagger_ner) diff --git a/src/REL/server.py b/src/REL/server.py index 476918b..b5614f3 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -187,6 +187,7 @@ def generate_response(self, text, spans): p.add_argument("--use_bert_base_cased", help = "use Bert base cased rather than Flair", action="store_true") p.add_argument("--use_bert_large_uncased", help = "use Bert large uncased rather than Flair", action="store_true") p.add_argument("--use_bert_base_uncased", help = "use Bert base uncased rather than Flair", action="store_true") + p.add_argument("--use_bert_multilingual", help = "use Bert multilingual rather than Flair", action="store_true") p.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true") p.add_argument("--split_docs_value", help = "threshold number of tokens to split document") @@ -196,6 +197,7 @@ def generate_response(self, text, spans): use_bert_large_cased = False use_bert_base_uncased = False use_bert_large_uncased = False + use_bert_multilingual = False if args.use_bert_base_cased: ner_model = load_bert_ner("dslim/bert-base-NER") @@ -209,6 +211,9 @@ def generate_response(self, text, spans): elif args.use_bert_large_uncased: ner_model = load_bert_ner("Jorgeutd/bert-large-uncased-finetuned-ner") use_bert_large_uncased = True + elif args.use_bert_multilingual: + ner_model = load_bert_ner("Davlan/bert-base-multilingual-cased-ner-hrl") + use_bert_multilingual = True else: ner_model = load_flair_ner(args.ner_model) @@ -228,7 +233,7 @@ def generate_response(self, text, spans): args.wiki_version, ed_model, ner_model, - (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased), + (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased or use_bert_multilingual), process_sentences, split_docs_value) ) From 67696a940601f1b68177a8de105408b19682428a Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 22 Dec 2022 17:23:59 +0000 Subject: [PATCH 09/61] refactored code --- src/REL/evaluate_predictions.py | 138 +++++++++++++++++++++++++++++ tests/test_bert_md.py | 38 ++++++++ tests/test_evaluate_predictions.py | 34 +++++++ 3 files changed, 210 insertions(+) create mode 100644 src/REL/evaluate_predictions.py create mode 100644 tests/test_bert_md.py create mode 100644 tests/test_evaluate_predictions.py diff --git a/src/REL/evaluate_predictions.py b/src/REL/evaluate_predictions.py new file mode 100644 index 0000000..8e460a6 --- /dev/null +++ 
b/src/REL/evaluate_predictions.py @@ -0,0 +1,138 @@ +import re + + +UNUSED = -1 + + +def get_gold_data(doc): + GOLD_DATA_FILE = "./data/generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv" + entities = [] + + in_file = open(GOLD_DATA_FILE, "r") + for line in in_file: + if re.search(f"^-DOCSTART- \({doc} ", line): + break + for line in in_file: + if re.search(f"^-DOCSTART- ", line): + break + fields = line.strip().split("\t") + if len(fields) > 3: + if fields[1] == "B": + entities.append([fields[2], fields[3]]) + return entities + + +def md_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i): + return gold_entities[gold_i][0].lower() == predicted_entities[predicted_i][0].lower() + + +def el_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i): + return(gold_entities[gold_i][0].lower() == predicted_entities[predicted_i][0].lower() and + gold_entities[gold_i][1].lower() == predicted_entities[predicted_i][1].lower()) + + +def find_correct_els(gold_entities, predicted_entities, gold_links, predicted_links): + for gold_i in range(0, len(gold_entities)): + if gold_links[gold_i] == UNUSED: + for predicted_i in range(0, len(predicted_entities)): + if (predicted_links[predicted_i] == UNUSED and + el_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i)): + gold_links[gold_i] = predicted_i + predicted_links[predicted_i] = gold_i + return gold_links, predicted_links + + +def find_correct_mds(gold_entities, predicted_entities, gold_links, predicted_links): + for gold_i in range(0, len(gold_entities)): + if gold_links[gold_i] == UNUSED: + for predicted_i in range(0, len(predicted_entities)): + if (predicted_links[predicted_i] == UNUSED and + md_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i)): + gold_links[gold_i] = predicted_i + predicted_links[predicted_i] = gold_i + return gold_links, predicted_links + + + +def compare_entities(gold_entities, predicted_entities): + gold_links = len(gold_entities) * [UNUSED] + predicted_links = len(predicted_entities) * [UNUSED] + gold_links, predicted_links = find_correct_els(gold_entities, predicted_entities, gold_links, predicted_links) + gold_links, predicted_links = find_correct_mds(gold_entities, predicted_entities, gold_links, predicted_links) + return gold_links, predicted_links + + +def count_entities(gold_entities, predicted_entities, gold_links, predicted_links): + correct = 0 + wrong_md = 0 + wrong_el = 0 + missed = 0 + for predicted_i in range(0, len(predicted_links)): + if predicted_links[predicted_i] == UNUSED: + wrong_md += 1 + elif predicted_entities[predicted_i][1] == gold_entities[predicted_links[predicted_i]][1]: + correct += 1 + else: + wrong_el += 1 + for gold_i in range(0, len(gold_links)): + if gold_links[gold_i] == UNUSED: + missed += 1 + return correct, wrong_md, wrong_el, missed + + +def compare_and_count_entities(gold_entities, predicted_entities): + gold_links, predicted_links = compare_entities(gold_entities, predicted_entities) + return count_entities(gold_entities, predicted_entities, gold_links, predicted_links) + + +def compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): + if correct_all + wrong_el_all > 0: + precision_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + wrong_md_all) + recall_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + missed_all) + f1_md = 2 * precision_md * recall_md / ( precision_md + recall_md ) + else: + precision_md = 0 + recall_md = 0 + f1_md 
= 0 + return precision_md, recall_md, f1_md + + +def compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): + if correct_all > 0: + precision_el = 100 * correct_all / (correct_all + wrong_md_all + wrong_el_all) + recall_el = 100 * correct_all / (correct_all + wrong_el_all + missed_all) + f1_el = 2 * precision_el * recall_el / ( precision_el + recall_el ) + else: + precision_el = 0.0 + recall_el = 0 + f1_el = 0 + return precision_el, recall_el, f1_el + + +def print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): + precision_md, recall_md, f1_md = compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) + precision_el, recall_el, f1_el = compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) + print("Results: PMD RMD FMD PEL REL FEL: ", end="") + print(f"{precision_md:0.1f}% {recall_md:0.1f}% {f1_md:0.1f}% | ",end="") + print(f"{precision_el:0.1f}% {recall_el:0.1f}% {f1_el:0.1f}%") + return precision_md, recall_md, f1_md, precision_el, recall_el, f1_el + + +def evaluate(predictions): + correct_all = 0 + wrong_md_all = 0 + wrong_el_all = 0 + missed_all = 0 + for doc in predictions: + gold_entities = get_gold_data(doc) + predicted_entities = [] + for mention in predictions[doc]: + predicted_entities.append([mention["mention"], mention["prediction"]]) + #print("GOLD", gold_entities) + #print("PREDICTED", predicted_entities) + correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) + correct_all += correct + wrong_md_all += wrong_md + wrong_el_all += wrong_el + missed_all += missed + print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) diff --git a/tests/test_bert_md.py b/tests/test_bert_md.py new file mode 100644 index 0000000..e7dad23 --- /dev/null +++ b/tests/test_bert_md.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from pathlib import Path + +from transformers import AutoTokenizer, AutoModelForTokenClassification +from transformers import pipeline + +from REL.mention_detection import MentionDetection +from REL.ner.bert_wrapper import load_bert_ner + + +def test_md(): + ner_model = load_bert_ner("dslim/bert-base-NER") + + md = MentionDetection(Path(__file__).parent, "wiki_test") + + # first test case: repeating sentences + sample1 = {"test_doc": ["Fox, Fox. 
Fox.", []]} + resulting_spans1 = {(0, 3), (5, 3), (10, 3)} + predictions = md.find_mentions(sample1, ner_model) + predicted_spans = [] + for i in range(0, 1): + p = { + (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[i]["test_doc"] + } + predicted_spans.extend(list(p)) + predicted_spans = set(predicted_spans) + assert resulting_spans1 == predicted_spans + + # second test case: excessive whitespace + sample2 = {"test_doc": ["Fox, Fox, Fox.", []]} + resulting_spans2 = {(0, 3), (20, 3), (43, 3)} + predictions = md.find_mentions(sample2, ner_model) + predicted_spans = { + (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] + } + assert resulting_spans2 == predicted_spans diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py new file mode 100644 index 0000000..c869831 --- /dev/null +++ b/tests/test_evaluate_predictions.py @@ -0,0 +1,34 @@ +from REL.evaluate_predictions import compare_and_count_entities, print_scores + + +def test_perfect(): + gold_entities = [ [ "1", "1" ] ] + predicted_entities = [ [ "1", "1" ] ] + correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) + precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) + assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [100, 100, 100, 100, 100, 100], "should be perfect MD and perfect EL" + + +def test_el_wrong(): + gold_entities = [ [ "1", "1" ] ] + predicted_entities = [ [ "1", "0" ] ] + correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) + precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) + assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [100, 100, 100, 0, 0, 0], "should be perfect MD and failed EL" + + +def test_md_wrong(): + gold_entities = [ [ "1", "1" ] ] + predicted_entities = [ [ "0", "1" ] ] + correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) + precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) + assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [0, 0, 0, 0, 0, 0], "should be failed MD and failed EL" + + +def test_combined(): + gold_entities = [ [ "1", "1" ], [ "1", "1" ], [ "2", "2" ] ] + predicted_entities = [ [ "0", "0" ], [ "0", "1" ], [ "1", "0" ], [ "1", "1" ] ] + correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) + precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) + assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [100/2, 100*2/3, 100*4/7, 100/4, 100/3, 100*2/7], "should be various scores" + From 0af9d194945acecbe829b6ab1154bc972eb110c3 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 23 Dec 2022 17:57:26 +0000 Subject: [PATCH 10/61] refactored code --- scripts/efficiency_test.py | 78 +++++------------- scripts/evaluate_predictions.py | 138 -------------------------------- src/REL/mention_detection.py | 13 ++- src/REL/ner/set_tagger_ner.py | 38 +++++++++ src/REL/server.py | 42 ++-------- tests/test_bert_md.py | 14 ++-- tests/test_flair_md.py | 16 ++-- 7 files changed, 95 insertions(+), 244 deletions(-) delete mode 100644 scripts/evaluate_predictions.py create mode 100644 src/REL/ner/set_tagger_ner.py diff --git 
a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 3486d7b..27175e0 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -1,9 +1,11 @@ import argparse -import evaluate_predictions +from REL.evaluate_predictions import evaluate import json import numpy as np +import re import requests +from REL.ner.set_tagger_ner import set_tagger_ner from REL.training_datasets import TrainingEvaluationDatasets parser = argparse.ArgumentParser() @@ -21,21 +23,19 @@ np.random.seed(seed=42) -base_url = "/store/userdata/etjong/REL-erik/data/" -if args.max_docs: - max_docs = int(args.max_docs) -else: - max_docs = 50 -if args.process_sentences: - process_sentences = True -else: - process_sentences = False +base_url = "/store/userdata/etjong/REL/data/" +process_sentences = args.process_sentences if args.split_docs_value: split_docs_value = int(args.split_docs_value) else: split_docs_value = 0 +if args.max_docs: + max_docs = int(args.max_docs) +else: + max_docs = 50 + if args.wiki_version: wiki_version = args.wiki_version else: @@ -43,29 +43,10 @@ datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()["aida_testB"] -if args.use_server: - use_server = True -else: - use_server = False - -use_bert_base_cased = False -use_bert_large_cased = False -use_bert_base_uncased = False -use_bert_large_uncased = False -use_bert_multilingual = False - -if args.use_bert_base_cased: - use_bert_base_cased = True -elif args.use_bert_large_cased: - use_bert_large_cased = True -elif args.use_bert_base_uncased: - use_bert_base_uncased = True -elif args.use_bert_large_uncased: - use_bert_large_uncased = True -elif args.use_bert_multilingual: - use_bert_multilingual = True - -print(f"max_docs={max_docs} wiki_version={wiki_version} use_bert_base_cased={use_bert_base_cased} use_bert_large_cased={use_bert_large_cased} use_bert_base_uncased={use_bert_base_uncased} use_bert_large_uncased={use_bert_large_uncased} use_bert_multilingual={use_bert_multilingual} use_server={use_server} process_sentences={process_sentences} split_docs_value={split_docs_value}") +use_server = args.use_server +tagger_ner, tagger_ner_name = set_tagger_ner(args.use_bert_base_cased, args.use_bert_base_uncased, args.use_bert_large_cased, args.use_bert_large_uncased, args.use_bert_multilingual) + +print(f"max_docs={max_docs} wiki_version={wiki_version} tagger_ner_name={tagger_ner_name} process_sentences={process_sentences} split_docs_value={split_docs_value}") docs = {} all_results = {} @@ -99,12 +80,10 @@ print("Output API:") results = requests.post("http://0.0.0.0:5555", json=myjson) + print(results.json()) print("----------------------------") - print("results", type(results), results) - print("results.json()", results.json()) try: results_list = [] - print("eff_test", results.json()) for result in results.json(): results_list.append({ "mention": result[2], "prediction": result[3] }) # Flair + Bert all_results[doc] = results_list @@ -113,7 +92,7 @@ all_results[doc] = [] if len(all_results) > 0: - evaluate_predictions.evaluate(all_results) + evaluate(all_results) # --------------------- Now total -------------------------------- # ------------- RUN SEPARATELY TO BALANCE LOAD-------------------- @@ -133,28 +112,8 @@ mention_detection = MentionDetection(base_url, wiki_version) - # Alternatively use Flair NER tagger. 
- if use_bert_base_uncased: - tagger_ner = load_bert_ner("dslim/bert-base-NER-uncased") - elif use_bert_large_uncased: - tagger_ner = load_bert_ner("Jorgeutd/bert-large-uncased-finetuned-ner") - elif use_bert_base_cased: - tagger_ner = load_bert_ner("dslim/bert-base-NER") - elif use_bert_large_cased: - tagger_ner = load_bert_ner("dslim/bert-large-NER") - elif use_bert_multilingual: - tagger_ner = load_bert_ner("Davlan/bert-base-multilingual-cased-ner-hrl") - else: - tagger_ner = SequenceTagger.load("ner-fast") - start = time() -# mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner) - mentions_dataset, n_mentions = mention_detection.find_mentions( - docs, - (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased or use_bert_multilingual), - process_sentences, - split_docs_value, - tagger_ner) + mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) print("MD took: {} seconds".format(round(time() - start, 2))) # 3. Load ED model. @@ -169,4 +128,5 @@ predictions, timing = ed_model.predict(mentions_dataset) print("ED took: {} seconds".format(round(time() - start, 2))) - evaluate_predictions.evaluate(predictions) + evaluate(predictions) + diff --git a/scripts/evaluate_predictions.py b/scripts/evaluate_predictions.py deleted file mode 100644 index 8e460a6..0000000 --- a/scripts/evaluate_predictions.py +++ /dev/null @@ -1,138 +0,0 @@ -import re - - -UNUSED = -1 - - -def get_gold_data(doc): - GOLD_DATA_FILE = "./data/generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv" - entities = [] - - in_file = open(GOLD_DATA_FILE, "r") - for line in in_file: - if re.search(f"^-DOCSTART- \({doc} ", line): - break - for line in in_file: - if re.search(f"^-DOCSTART- ", line): - break - fields = line.strip().split("\t") - if len(fields) > 3: - if fields[1] == "B": - entities.append([fields[2], fields[3]]) - return entities - - -def md_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i): - return gold_entities[gold_i][0].lower() == predicted_entities[predicted_i][0].lower() - - -def el_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i): - return(gold_entities[gold_i][0].lower() == predicted_entities[predicted_i][0].lower() and - gold_entities[gold_i][1].lower() == predicted_entities[predicted_i][1].lower()) - - -def find_correct_els(gold_entities, predicted_entities, gold_links, predicted_links): - for gold_i in range(0, len(gold_entities)): - if gold_links[gold_i] == UNUSED: - for predicted_i in range(0, len(predicted_entities)): - if (predicted_links[predicted_i] == UNUSED and - el_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i)): - gold_links[gold_i] = predicted_i - predicted_links[predicted_i] = gold_i - return gold_links, predicted_links - - -def find_correct_mds(gold_entities, predicted_entities, gold_links, predicted_links): - for gold_i in range(0, len(gold_entities)): - if gold_links[gold_i] == UNUSED: - for predicted_i in range(0, len(predicted_entities)): - if (predicted_links[predicted_i] == UNUSED and - md_match(gold_entities, predicted_entities, predicted_links, gold_i, predicted_i)): - gold_links[gold_i] = predicted_i - predicted_links[predicted_i] = gold_i - return gold_links, predicted_links - - - -def compare_entities(gold_entities, predicted_entities): - gold_links = len(gold_entities) * [UNUSED] - predicted_links = len(predicted_entities) * [UNUSED] - gold_links, 
predicted_links = find_correct_els(gold_entities, predicted_entities, gold_links, predicted_links) - gold_links, predicted_links = find_correct_mds(gold_entities, predicted_entities, gold_links, predicted_links) - return gold_links, predicted_links - - -def count_entities(gold_entities, predicted_entities, gold_links, predicted_links): - correct = 0 - wrong_md = 0 - wrong_el = 0 - missed = 0 - for predicted_i in range(0, len(predicted_links)): - if predicted_links[predicted_i] == UNUSED: - wrong_md += 1 - elif predicted_entities[predicted_i][1] == gold_entities[predicted_links[predicted_i]][1]: - correct += 1 - else: - wrong_el += 1 - for gold_i in range(0, len(gold_links)): - if gold_links[gold_i] == UNUSED: - missed += 1 - return correct, wrong_md, wrong_el, missed - - -def compare_and_count_entities(gold_entities, predicted_entities): - gold_links, predicted_links = compare_entities(gold_entities, predicted_entities) - return count_entities(gold_entities, predicted_entities, gold_links, predicted_links) - - -def compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): - if correct_all + wrong_el_all > 0: - precision_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + wrong_md_all) - recall_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + missed_all) - f1_md = 2 * precision_md * recall_md / ( precision_md + recall_md ) - else: - precision_md = 0 - recall_md = 0 - f1_md = 0 - return precision_md, recall_md, f1_md - - -def compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): - if correct_all > 0: - precision_el = 100 * correct_all / (correct_all + wrong_md_all + wrong_el_all) - recall_el = 100 * correct_all / (correct_all + wrong_el_all + missed_all) - f1_el = 2 * precision_el * recall_el / ( precision_el + recall_el ) - else: - precision_el = 0.0 - recall_el = 0 - f1_el = 0 - return precision_el, recall_el, f1_el - - -def print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all): - precision_md, recall_md, f1_md = compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) - precision_el, recall_el, f1_el = compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) - print("Results: PMD RMD FMD PEL REL FEL: ", end="") - print(f"{precision_md:0.1f}% {recall_md:0.1f}% {f1_md:0.1f}% | ",end="") - print(f"{precision_el:0.1f}% {recall_el:0.1f}% {f1_el:0.1f}%") - return precision_md, recall_md, f1_md, precision_el, recall_el, f1_el - - -def evaluate(predictions): - correct_all = 0 - wrong_md_all = 0 - wrong_el_all = 0 - missed_all = 0 - for doc in predictions: - gold_entities = get_gold_data(doc) - predicted_entities = [] - for mention in predictions[doc]: - predicted_entities.append([mention["mention"], mention["prediction"]]) - #print("GOLD", gold_entities) - #print("PREDICTED", predicted_entities) - correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) - correct_all += correct - wrong_md_all += wrong_md - wrong_el_all += wrong_el - missed_all += missed - print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index b9dcd91..95ce285 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -92,6 +92,13 @@ def split_text(self, dataset, process_sentences, split_docs_value=0, tagger=None text, spans = dataset[doc] if process_sentences: sentences = self.split_single(text) + if split_docs_value > 0: + sentences_split = [] + for sentence in 
sentences: + split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger) + sentences_split.extend(split_sentences) + print("sentences_split", len(sentences_split) != len(sentences), len(sentences_split), len(sentences)) + sentences = sentences_split elif split_docs_value > 0: sentences = self.split_text_in_parts(text, split_docs_value, tagger) else: @@ -127,7 +134,7 @@ def split_text(self, dataset, process_sentences, split_docs_value=0, tagger=None def combine_entities(self, ner_results): ner_results_out = [] i = 0 - while i < len(ner_results)-1: + while i < len(ner_results): last_end = ner_results[i]["end"] ner_results_out.append(dict(ner_results[i])) j = 1 @@ -198,7 +205,7 @@ def split_text_in_parts(self, text, split_docs_value, tagger): return texts - def find_mentions(self, dataset, use_bert, process_sentences, split_docs_value, tagger=None): + def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_value, tagger=None): """ Responsible for finding mentions given a set of documents in a batch-wise manner. More specifically, it returns the mention, its left/right context and a set of candidates. @@ -242,7 +249,7 @@ def find_mentions(self, dataset, use_bert, process_sentences, split_docs_value, if is_flair else self.combine_entities(tagger(snt)) ): - if use_bert: + if re.search("bert", tagger_ner_name): text, start_pos, end_pos, conf, tag = ( sentence[entity["start"]:entity["end"]], # for BERT entity["start"], diff --git a/src/REL/ner/set_tagger_ner.py b/src/REL/ner/set_tagger_ner.py new file mode 100644 index 0000000..23edc4a --- /dev/null +++ b/src/REL/ner/set_tagger_ner.py @@ -0,0 +1,38 @@ +import re + +from flair.models import SequenceTagger +from REL.ner.bert_wrapper import load_bert_ner + + +taggers_ner = { + "flair": "ner-fast", + "bert_base_cased": "dslim/bert-base-NER", + "bert_base_uncased": "dslim/bert-base-NER-uncased", + "bert_large_cased": "dslim/bert-large-NER", + "bert_large_uncased": "Jorgeutd/bert-large-uncased-finetuned-ner", + "bert_multilingual": "Davlan/bert-base-multilingual-cased-ner-hrl" +} + + +def set_tagger_ner(use_bert_base_cased, use_bert_base_uncased, use_bert_large_cased, use_bert_large_uncased, use_bert_multilingual): + if use_bert_base_cased: + tagger_ner_name = "bert_base_cased" + elif use_bert_base_uncased: + tagger_ner_name = "bert_base_uncased" + elif use_bert_large_cased: + tagger_ner_name = "bert_large_cased" + elif use_bert_large_uncased: + tagger_ner_name = "bert_large_uncased" + elif use_bert_multilingual: + tagger_ner_name = "bert_multilingual" + else: + tagger_ner_name = "flair" + + if re.search("flair", tagger_ner_name): + tagger_ner = SequenceTagger.load("ner-fast") + elif re.search("bert", tagger_ner_name): + tagger_ner = load_bert_ner(taggers_ner[tagger_ner_name]) + else: + raise Exception(f"unknown tagger name: {tagger_ner_name}") + + return tagger_ner, tagger_ner_name diff --git a/src/REL/server.py b/src/REL/server.py index b5614f3..cb3c826 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -1,17 +1,19 @@ import json import numpy +import re from http.server import BaseHTTPRequestHandler from flair.models import SequenceTagger from REL.mention_detection import MentionDetection from REL.utils import process_results +from REL.ner.set_tagger_ner import set_tagger_ner API_DOC = "API_DOC" -def make_handler(base_url, wiki_version, ed_model, tagger_ner, use_bert, process_sentences, split_docs_value=0): +def make_handler(base_url, wiki_version, ed_model, tagger_ner, tagger_ner_name, 
process_sentences, split_docs_value=0): """ Class/function combination that is used to setup an API that can be used for e.g. GERBIL evaluation. """ @@ -19,7 +21,7 @@ class GetHandler(BaseHTTPRequestHandler): def __init__(self, *args, **kwargs): self.ed_model = ed_model self.tagger_ner = tagger_ner - self.use_bert = use_bert + self.tagger_ner_name = tagger_ner_name self.process_sentences = process_sentences self.split_docs_value = split_docs_value @@ -144,7 +146,7 @@ def generate_response(self, text, spans): # EL processed = {API_DOC: [text, spans]} mentions_dataset, total_ment = self.mention_detection.find_mentions( - processed, self.use_bert, self.process_sentences, self.split_docs_value, self.tagger_ner + processed, self.tagger_ner_name, self.process_sentences, self.split_docs_value, self.tagger_ner ) # Disambiguation @@ -173,7 +175,7 @@ def generate_response(self, text, spans): from http.server import HTTPServer from REL.entity_disambiguation import EntityDisambiguation - from REL.ner import load_flair_ner + from REL.ner.flair_wrapper import load_flair_ner from REL.ner.bert_wrapper import load_bert_ner p = argparse.ArgumentParser() @@ -193,29 +195,7 @@ def generate_response(self, text, spans): args = p.parse_args() - use_bert_base_cased = False - use_bert_large_cased = False - use_bert_base_uncased = False - use_bert_large_uncased = False - use_bert_multilingual = False - - if args.use_bert_base_cased: - ner_model = load_bert_ner("dslim/bert-base-NER") - use_bert_base_cased = True - elif args.use_bert_large_cased: - ner_model = load_bert_ner("dslim/bert-large-NER") - use_bert_large_cased = True - elif args.use_bert_base_uncased: - ner_model = load_bert_ner("dslim/bert-base-NER-uncased") - use_bert_base_uncased = True - elif args.use_bert_large_uncased: - ner_model = load_bert_ner("Jorgeutd/bert-large-uncased-finetuned-ner") - use_bert_large_uncased = True - elif args.use_bert_multilingual: - ner_model = load_bert_ner("Davlan/bert-base-multilingual-cased-ner-hrl") - use_bert_multilingual = True - else: - ner_model = load_flair_ner(args.ner_model) + tagger_ner, tagger_ner_name = set_tagger_ner(args.use_bert_base_cased, args.use_bert_base_uncased, args.use_bert_large_cased, args.use_bert_large_uncased, args.use_bert_multilingual) split_docs_value = 0 if args.split_docs_value: @@ -229,13 +209,7 @@ def generate_response(self, text, spans): server_address = (args.bind, args.port) server = HTTPServer( server_address, - make_handler(args.base_url, - args.wiki_version, - ed_model, - ner_model, - (use_bert_base_cased or use_bert_large_cased or use_bert_base_uncased or use_bert_large_uncased or use_bert_multilingual), - process_sentences, - split_docs_value) + make_handler(args.base_url, args.wiki_version, ed_model, tagger_ner, tagger_ner_name, process_sentences, split_docs_value) ) try: diff --git a/tests/test_bert_md.py b/tests/test_bert_md.py index e7dad23..1b80075 100644 --- a/tests/test_bert_md.py +++ b/tests/test_bert_md.py @@ -11,14 +11,18 @@ def test_md(): - ner_model = load_bert_ner("dslim/bert-base-NER") - - md = MentionDetection(Path(__file__).parent, "wiki_test") + tagger_ner_name = "bert_base_cased" + tagger_ner = load_bert_ner("dslim/bert-base-NER") + process_sentences = False + split_docs_value = 0 + base_url = "/store/userdata/etjong/REL/data/" + wiki_version = "wiki_2019" + md = MentionDetection(base_url, wiki_version) # first test case: repeating sentences sample1 = {"test_doc": ["Fox, Fox. 
Fox.", []]} resulting_spans1 = {(0, 3), (5, 3), (10, 3)} - predictions = md.find_mentions(sample1, ner_model) + predictions = md.find_mentions(sample1, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) predicted_spans = [] for i in range(0, 1): p = { @@ -31,7 +35,7 @@ def test_md(): # second test case: excessive whitespace sample2 = {"test_doc": ["Fox, Fox, Fox.", []]} resulting_spans2 = {(0, 3), (20, 3), (43, 3)} - predictions = md.find_mentions(sample2, ner_model) + predictions = md.find_mentions(sample2, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) predicted_spans = { (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] } diff --git a/tests/test_flair_md.py b/tests/test_flair_md.py index ac7ab62..da16891 100644 --- a/tests/test_flair_md.py +++ b/tests/test_flair_md.py @@ -10,13 +10,19 @@ def test_md(): # return standard Flair tagger + mention detection object - tagger = SequenceTagger.load("ner-fast") - md = MentionDetection(Path(__file__).parent, "wiki_test") + tagger_ner_name = "flair" + tagger_ner = SequenceTagger.load("ner-fast") + process_sentences = True + split_docs_value = 0 + base_url = "/store/userdata/etjong/REL/data/" + wiki_version = "wiki_2019" + md = MentionDetection(base_url, wiki_version) # first test case: repeating sentences - sample1 = {"test_doc": ["Fox, Fox. Fox.", []]} + sample1 = {"test_doc": [ "Fox. Fox. Fox." , []] } resulting_spans1 = {(0, 3), (5, 3), (10, 3)} - predictions = md.find_mentions(sample1, tagger) + predictions = md.find_mentions(sample1, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) + predicted_spans = { (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] } @@ -25,7 +31,7 @@ def test_md(): # second test case: excessive whitespace sample2 = {"test_doc": ["Fox. Fox. 
Fox.", []]} resulting_spans2 = {(0, 3), (20, 3), (43, 3)} - predictions = md.find_mentions(sample2, tagger) + predictions = md.find_mentions(sample2, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) predicted_spans = { (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] } From 8c182516e78c32c29118a08290557f1cd63de9e2 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 5 Jan 2023 12:31:04 +0000 Subject: [PATCH 11/61] smooth installation updates --- scripts/efficiency_test.py | 3 ++- setup.cfg | 2 ++ tests/test_bert_md.py | 4 +++- tests/test_flair_md.py | 4 +++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 27175e0..ce30733 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -2,6 +2,7 @@ from REL.evaluate_predictions import evaluate import json import numpy as np +import os import re import requests @@ -23,7 +24,7 @@ np.random.seed(seed=42) -base_url = "/store/userdata/etjong/REL/data/" +base_url = os.path.dirname(__file__) + "/../data/" process_sentences = args.process_sentences if args.split_docs_value: diff --git a/setup.cfg b/setup.cfg index 8fbd4af..6eb9f3a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,6 +50,8 @@ install_requires = torch nltk anyascii + termcolor + syntok [options.extras_require] develop = diff --git a/tests/test_bert_md.py b/tests/test_bert_md.py index 1b80075..eb5bc75 100644 --- a/tests/test_bert_md.py +++ b/tests/test_bert_md.py @@ -3,6 +3,8 @@ from pathlib import Path +import os + from transformers import AutoTokenizer, AutoModelForTokenClassification from transformers import pipeline @@ -15,7 +17,7 @@ def test_md(): tagger_ner = load_bert_ner("dslim/bert-base-NER") process_sentences = False split_docs_value = 0 - base_url = "/store/userdata/etjong/REL/data/" + base_url = os.path.dirname(__file__) + "/../data/" wiki_version = "wiki_2019" md = MentionDetection(base_url, wiki_version) diff --git a/tests/test_flair_md.py b/tests/test_flair_md.py index da16891..606e03b 100644 --- a/tests/test_flair_md.py +++ b/tests/test_flair_md.py @@ -3,6 +3,8 @@ from pathlib import Path +import os + from flair.models import SequenceTagger from REL.mention_detection import MentionDetection @@ -14,7 +16,7 @@ def test_md(): tagger_ner = SequenceTagger.load("ner-fast") process_sentences = True split_docs_value = 0 - base_url = "/store/userdata/etjong/REL/data/" + base_url = os.path.dirname(__file__) + "/../data/" wiki_version = "wiki_2019" md = MentionDetection(base_url, wiki_version) From a6ae21100fe21c4942f11f4d08cb9aff879f72b7 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 5 Jan 2023 15:07:18 +0100 Subject: [PATCH 12/61] fixed tests/test_ed_pipeline.py --- src/REL/db/base.py | 8 ++++---- src/REL/mention_detection.py | 33 ++++++++++++++++----------------- tests/test_ed_pipeline.py | 6 +++--- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/REL/db/base.py b/src/REL/db/base.py index 01e3d81..8b66db5 100644 --- a/src/REL/db/base.py +++ b/src/REL/db/base.py @@ -184,10 +184,10 @@ def lookup_wik(self, w, table_name, column): "select {} from {} where word = :word".format(column, table_name), {"word": w}, ).fetchone() - res = ( - #e if e is None else json.loads(e[0].decode()) if column == "p_e_m" else e[0] - e if e is None else json.loads("".join(chr(int(x, 2)) for x in e[0].split())) if column == "p_e_m" else e[0] - ) + try: + res = ( e if e is None else json.loads(e[0].decode()) if column == "p_e_m" else e[0] ) 
+ except Exception: + res = ( e if e is None else json.loads("".join(chr(int(x, 2)) for x in e[0].split())) if column == "p_e_m" else e[0] ) return res diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index 95ce285..035960c 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -20,7 +20,7 @@ def __init__(self, base_url, wiki_version): super().__init__(base_url, wiki_version) - def format_spans(self, dataset): + def format_spans(self, dataset, tagger_ner_name, process_sentences, split_docs_value=0, tagger_ner=None): """ Responsible for formatting given spans into dataset for the ED step. More specifically, it returns the mention, its left/right context and a set of candidates. @@ -28,7 +28,7 @@ def format_spans(self, dataset): :return: Dictionary with mentions per document. """ - dataset, _, _ = self.split_text(dataset) + dataset, _, _ = self.split_text(dataset, process_sentences, split_docs_value, tagger_ner, re.search("flair", tagger_ner_name)) results = {} total_ment = 0 @@ -76,7 +76,7 @@ def split_single(self, text): return sentences - def split_text(self, dataset, process_sentences, split_docs_value=0, tagger=None, is_flair=False): + def split_text(self, dataset, process_sentences, split_docs_value=0, tagger_ner=None, is_flair=False): """ Splits text into sentences with optional spans (format is a requirement for GERBIL usage). This behavior is required for the default NER-tagger, which during experiments was experienced @@ -95,12 +95,11 @@ def split_text(self, dataset, process_sentences, split_docs_value=0, tagger=None if split_docs_value > 0: sentences_split = [] for sentence in sentences: - split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger) + split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger_ner) sentences_split.extend(split_sentences) - print("sentences_split", len(sentences_split) != len(sentences), len(sentences_split), len(sentences)) sentences = sentences_split elif split_docs_value > 0: - sentences = self.split_text_in_parts(text, split_docs_value, tagger) + sentences = self.split_text_in_parts(text, split_docs_value, tagger_ner) else: sentences = [ text ] res[doc] = {} @@ -153,11 +152,11 @@ def combine_entities(self, ner_results): return ner_results_out - def split_sentence_in_bert_tokens(self, sentence, tagger): - tokenizer_results = tagger.tokenizer([sentence], return_offsets_mapping=True) # warns if sentence is too long (>512) + def split_sentence_in_bert_tokens(self, sentence, tagger_ner): + tokenizer_results = tagger_ner.tokenizer([sentence], return_offsets_mapping=True) # warns if sentence is too long (>512) input_ids = tokenizer_results["input_ids"][0] token_spans = tokenizer_results["offset_mapping"][0] - tokens = [ tagger.tokenizer.decode(token_id) for token_id in input_ids ] + tokens = [ tagger_ner.tokenizer.decode(token_id) for token_id in input_ids ] return tokens, token_spans @@ -174,7 +173,7 @@ def combine_tokens_to_text(self, token_list): return text - def split_text_in_parts(self, text, split_docs_value, tagger): + def split_text_in_parts(self, text, split_docs_value, tagger_ner): """ Splits text in parts of as most split_docs_value tokens. Texts are split at sentence boundaries. 
If a sentence is longer than the limit it will be split in parts of @@ -184,7 +183,7 @@ def split_text_in_parts(self, text, split_docs_value, tagger): token_lists = [] texts = [] for sentence in sentences: - sentence_tokens, token_spans = self.split_sentence_in_bert_tokens(sentence, tagger) + sentence_tokens, token_spans = self.split_sentence_in_bert_tokens(sentence, tagger_ner) if len(token_lists) == 0 or (len(token_lists[-1]) + len(sentence_tokens)) > split_docs_value: token_lists.append([]) texts.append("") @@ -205,25 +204,25 @@ def split_text_in_parts(self, text, split_docs_value, tagger): return texts - def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_value, tagger=None): + def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_value=0, tagger_ner=None): """ Responsible for finding mentions given a set of documents in a batch-wise manner. More specifically, it returns the mention, its left/right context and a set of candidates. :return: Dictionary with mentions per document. """ - if tagger is None: + if tagger_ner is None: raise Exception( "No NER tagger is set, but you are attempting to perform Mention Detection.." ) # Verify if Flair, else ngram or custom. - is_flair = isinstance(tagger, SequenceTagger) + is_flair = isinstance(tagger_ner, SequenceTagger) dataset_sentences_raw, processed_sentences, splits = self.split_text( - dataset, process_sentences, split_docs_value, tagger, is_flair + dataset, process_sentences, split_docs_value, tagger_ner, is_flair ) results = {} total_ment = 0 if is_flair: - tagger.predict(processed_sentences) + tagger_ner.predict(processed_sentences) for i, doc in enumerate(dataset_sentences_raw): raw_text = dataset[doc][0] contents = dataset_sentences_raw[doc] @@ -247,7 +246,7 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_ for entity in ( snt.get_spans("ner") if is_flair - else self.combine_entities(tagger(snt)) + else self.combine_entities(tagger_ner(snt)) ): if re.search("bert", tagger_ner_name): text, start_pos, end_pos, conf, tag = ( diff --git a/tests/test_ed_pipeline.py b/tests/test_ed_pipeline.py index b2041e7..06bf11e 100644 --- a/tests/test_ed_pipeline.py +++ b/tests/test_ed_pipeline.py @@ -22,13 +22,13 @@ def test_pipeline(): tagger = Cmns(base_url, wiki_subfolder, n=5) model = EntityDisambiguation(base_url, wiki_subfolder, config) - mentions_dataset, total_mentions = md.format_spans(sample) + mentions_dataset, total_mentions = md.format_spans(sample, tagger_ner_name="flair", process_sentences=True) predictions, _ = model.predict(mentions_dataset) results = process_results( mentions_dataset, predictions, sample, include_offset=False ) - gold_truth = {"test_doc": [(10, 3, "Fox", "fox", -1, "NULL", 0.0)]} + gold_truth = {"test_doc": [(10, 3, "fox", "Fox", 0.0, 0.0, "NULL")]} - return results == gold_truth + assert results == gold_truth From 1dd3a54e4124433dd34a6a4f19a82ae723c095ea Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 5 Jan 2023 16:01:45 +0100 Subject: [PATCH 13/61] made required arguments optional --- src/REL/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/REL/server.py b/src/REL/server.py index cb3c826..ea569e4 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -179,8 +179,8 @@ def generate_response(self, text, spans): from REL.ner.bert_wrapper import load_bert_ner p = argparse.ArgumentParser() - p.add_argument("base_url") - p.add_argument("wiki_version") + p.add_argument("--base_url", 
default="data") + p.add_argument("--wiki_version", default="wiki_2019") p.add_argument("--ed-model", default="ed-wiki-2019") p.add_argument("--ner-model", default="ner-fast") p.add_argument("--bind", "-b", metavar="ADDRESS", default="0.0.0.0") From e7b16045937ed6b76d1b7d95b0846fd926f1a7e2 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 13 Jan 2023 18:33:19 +0000 Subject: [PATCH 14/61] code cleanup --- src/REL/evaluate_predictions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/REL/evaluate_predictions.py b/src/REL/evaluate_predictions.py index 8e460a6..90727b5 100644 --- a/src/REL/evaluate_predictions.py +++ b/src/REL/evaluate_predictions.py @@ -128,8 +128,6 @@ def evaluate(predictions): predicted_entities = [] for mention in predictions[doc]: predicted_entities.append([mention["mention"], mention["prediction"]]) - #print("GOLD", gold_entities) - #print("PREDICTED", predicted_entities) correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) correct_all += correct wrong_md_all += wrong_md From cd24c2762ed6b57a3039ce27f0bfe05052683564 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 13 Jan 2023 18:35:41 +0000 Subject: [PATCH 15/61] prune word-internal mentions --- src/REL/mention_detection.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index 035960c..a330e4c 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -204,6 +204,20 @@ def split_text_in_parts(self, text, split_docs_value, tagger_ner): return texts + def prune_word_internal_mentions(self, raw_text, result_doc, total_ment): + to_be_deleted = [] + for i in range(0, len(result_doc)): + start_pos = result_doc[i]["pos"] + end_pos = result_doc[i]["end_pos"] + if ((i > 0 and re.search("[a-zA-Z]", raw_text[start_pos-1])) or + (end_pos < len(raw_text) and re.search("[a-zA-Z]", raw_text[end_pos]))): + to_be_deleted.append(i) + total_ment -= len(to_be_deleted) + while len(to_be_deleted) > 0: + result_doc.pop(to_be_deleted.pop(-1)) + return result_doc, total_ment + + def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_value=0, tagger_ner=None): """ Responsible for finding mentions given a set of documents in a batch-wise manner. 
More specifically, @@ -222,7 +236,7 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_ results = {} total_ment = 0 if is_flair: - tagger_ner.predict(processed_sentences) + tagger_ner.predict(processed_sentences) # predict with Flair for i, doc in enumerate(dataset_sentences_raw): raw_text = dataset[doc][0] contents = dataset_sentences_raw[doc] @@ -246,7 +260,7 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_ for entity in ( snt.get_spans("ner") if is_flair - else self.combine_entities(tagger_ner(snt)) + else self.combine_entities(tagger_ner(snt)) # predict with BERT ): if re.search("bert", tagger_ner_name): text, start_pos, end_pos, conf, tag = ( @@ -289,5 +303,6 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_ } result_doc.append(res) cum_sent_length += len(sentence) + (offset - cum_sent_length) + result_doc, total_ment = self.prune_word_internal_mentions(raw_text, result_doc, total_ment) results[doc] = result_doc return results, total_ment From f2a551466da9b285041b9f2177b5494cd414cff9 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 13 Jan 2023 18:37:23 +0000 Subject: [PATCH 16/61] solve initials bug --- src/REL/mention_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index a330e4c..fea67b5 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -43,7 +43,7 @@ def format_spans(self, dataset, tagger_ner_name, process_sentences, split_docs_v # end_pos = start_pos + length # ngram = text[start_pos:end_pos] - mention = self.preprocess_mention(ngram) + mention = self.preprocess_mention(ngram) # mention may be different from ngram left_ctxt, right_ctxt = self.get_ctxt( start_pos, end_pos, idx_sent, sentence, sentences_doc ) @@ -279,7 +279,7 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_ entity.tag, ) total_ment += 1 - m = self.preprocess_mention(text) + m = self.preprocess_mention(text) # m may be different from text cands = self.get_candidates(m) if len(cands) == 0: continue @@ -289,7 +289,7 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_ start_pos, end_pos, idx_sent, sentence, sentences_doc ) res = { - "mention": m, + "mention": text, # 20230113 was m "context": (left_ctxt, right_ctxt), "candidates": cands, "gold": ["NONE"], From 0b9030946cc9eff670f87b52dbd57306abdf8563 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Fri, 13 Jan 2023 18:48:04 +0000 Subject: [PATCH 17/61] file path standardization --- scripts/efficiency_test.py | 3 +-- src/REL/server.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index ce30733..b1e2472 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -24,7 +24,7 @@ np.random.seed(seed=42) -base_url = os.path.dirname(__file__) + "/../data/" +base_url = os.path.abspath(os.path.dirname(__file__) + "/../data/") process_sentences = args.process_sentences if args.split_docs_value: @@ -130,4 +130,3 @@ print("ED took: {} seconds".format(round(time() - start, 2))) evaluate(predictions) - diff --git a/src/REL/server.py b/src/REL/server.py index ea569e4..c0c1ca8 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -1,5 +1,6 @@ import json import numpy +import os import re from http.server import BaseHTTPRequestHandler @@ -179,7 +180,7 @@ def 
generate_response(self, text, spans): from REL.ner.bert_wrapper import load_bert_ner p = argparse.ArgumentParser() - p.add_argument("--base_url", default="data") + p.add_argument("--base_url", default=os.path.abspath(os.path.dirname(__file__) + "/../../data/")) p.add_argument("--wiki_version", default="wiki_2019") p.add_argument("--ed-model", default="ed-wiki-2019") p.add_argument("--ner-model", default="ner-fast") From dc008c0cc6f909d44d420583664292adbdd1fcc0 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 16 Jan 2023 16:57:08 +0000 Subject: [PATCH 18/61] flagged flair with splitting --- src/REL/mention_detection.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index fea67b5..6de019e 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -95,11 +95,11 @@ def split_text(self, dataset, process_sentences, split_docs_value=0, tagger_ner= if split_docs_value > 0: sentences_split = [] for sentence in sentences: - split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger_ner) + split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger_ner, is_flair) sentences_split.extend(split_sentences) sentences = sentences_split elif split_docs_value > 0: - sentences = self.split_text_in_parts(text, split_docs_value, tagger_ner) + sentences = self.split_text_in_parts(text, split_docs_value, tagger_ner, is_flair) else: sentences = [ text ] res[doc] = {} @@ -173,7 +173,7 @@ def combine_tokens_to_text(self, token_list): return text - def split_text_in_parts(self, text, split_docs_value, tagger_ner): + def split_text_in_parts(self, text, split_docs_value, tagger_ner, is_flair): """ Splits text in parts of as most split_docs_value tokens. Texts are split at sentence boundaries. 
If a sentence is longer than the limit it will be split in parts of @@ -183,6 +183,8 @@ def split_text_in_parts(self, text, split_docs_value, tagger_ner): token_lists = [] texts = [] for sentence in sentences: + if is_flair: + raise Exception("Splitting documents does not work in combination with Flair") sentence_tokens, token_spans = self.split_sentence_in_bert_tokens(sentence, tagger_ner) if len(token_lists) == 0 or (len(token_lists[-1]) + len(sentence_tokens)) > split_docs_value: token_lists.append([]) From 151bac4ec30b37c590798efd69f591568ac7c90c Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 19 Jan 2023 10:00:34 +0000 Subject: [PATCH 19/61] move evaluate_predictions.py to scripts --- {src/REL => scripts}/evaluate_predictions.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {src/REL => scripts}/evaluate_predictions.py (100%) diff --git a/src/REL/evaluate_predictions.py b/scripts/evaluate_predictions.py similarity index 100% rename from src/REL/evaluate_predictions.py rename to scripts/evaluate_predictions.py From 894837d8b6561fa87b9e32050b84b942cf335879 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 19 Jan 2023 10:00:59 +0000 Subject: [PATCH 20/61] move evaluate_predictions.py to scripts --- scripts/efficiency_test.py | 2 +- tests/test_evaluate_predictions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index b1e2472..2d3780a 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -1,5 +1,5 @@ import argparse -from REL.evaluate_predictions import evaluate +from scripts.evaluate_predictions import evaluate import json import numpy as np import os diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py index c869831..3fbf1d8 100644 --- a/tests/test_evaluate_predictions.py +++ b/tests/test_evaluate_predictions.py @@ -1,4 +1,4 @@ -from REL.evaluate_predictions import compare_and_count_entities, print_scores +from scripts.evaluate_predictions import compare_and_count_entities, print_scores def test_perfect(): From 08a593854104cff4e4a70919ef43f2d3b4541a3b Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 30 Jan 2023 09:25:46 +0000 Subject: [PATCH 21/61] simplified NER tagger selection --- scripts/efficiency_test.py | 14 ++++++-------- src/REL/ner/set_tagger_ner.py | 23 +++++------------------ src/REL/server.py | 9 +++------ 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 2d3780a..99950db 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -13,11 +13,7 @@ parser.add_argument("--max_docs", help = "number of documents") parser.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true") parser.add_argument("--split_docs_value", help = "threshold number of tokens to split document") -parser.add_argument("--use_bert_base_cased", help = "use Bert base cased rather than Flair", action="store_true") -parser.add_argument("--use_bert_large_cased", help = "use Bert large cased rather than Flair", action="store_true") -parser.add_argument("--use_bert_base_uncased", help = "use Bert base uncased rather than Flair", action="store_true") -parser.add_argument("--use_bert_large_uncased", help = "use Bert large uncased rather than Flair", action="store_true") -parser.add_argument("--use_bert_multilingual", help = "use Bert multilingual rather than Flair", action="store_true") 
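+# the accepted values for --tagger_ner_name are the keys of the taggers_ner
+# dict in src/REL/ner/set_tagger_ner.py, for example "flair", "bert_base_cased"
+# or "bert_multilingual"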
+parser.add_argument("--tagger_ner_name", help = "mention detection tagger") parser.add_argument("--use_server", help = "use server", action="store_true") parser.add_argument("--wiki_version", help = "Wiki version") args = parser.parse_args() @@ -45,9 +41,6 @@ datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()["aida_testB"] use_server = args.use_server -tagger_ner, tagger_ner_name = set_tagger_ner(args.use_bert_base_cased, args.use_bert_base_uncased, args.use_bert_large_cased, args.use_bert_large_uncased, args.use_bert_multilingual) - -print(f"max_docs={max_docs} wiki_version={wiki_version} tagger_ner_name={tagger_ner_name} process_sentences={process_sentences} split_docs_value={split_docs_value}") docs = {} all_results = {} @@ -69,6 +62,7 @@ docs[doc] = [text, []] # Demo script that can be used to query the API. if use_server: + print(f"max_docs={max_docs} use_server={use_server}") myjson = { "text": text, "spans": [ @@ -110,6 +104,10 @@ from REL.ner.bert_wrapper import load_bert_ner flair.device = torch.device("cpu") + tagger_ner_name = args.tagger_ner_name + tagger_ner = set_tagger_ner(tagger_ner_name) + + print(f"max_docs={max_docs} tagger_ner_name={tagger_ner_name} wiki_version={wiki_version} process_sentences={process_sentences} split_docs_value={split_docs_value}") mention_detection = MentionDetection(base_url, wiki_version) diff --git a/src/REL/ner/set_tagger_ner.py b/src/REL/ner/set_tagger_ner.py index 23edc4a..363ac7c 100644 --- a/src/REL/ner/set_tagger_ner.py +++ b/src/REL/ner/set_tagger_ner.py @@ -14,25 +14,12 @@ } -def set_tagger_ner(use_bert_base_cased, use_bert_base_uncased, use_bert_large_cased, use_bert_large_uncased, use_bert_multilingual): - if use_bert_base_cased: - tagger_ner_name = "bert_base_cased" - elif use_bert_base_uncased: - tagger_ner_name = "bert_base_uncased" - elif use_bert_large_cased: - tagger_ner_name = "bert_large_cased" - elif use_bert_large_uncased: - tagger_ner_name = "bert_large_uncased" - elif use_bert_multilingual: - tagger_ner_name = "bert_multilingual" - else: - tagger_ner_name = "flair" - - if re.search("flair", tagger_ner_name): - tagger_ner = SequenceTagger.load("ner-fast") - elif re.search("bert", tagger_ner_name): +def set_tagger_ner(tagger_ner_name): + if re.search("^flair", tagger_ner_name): + tagger_ner = SequenceTagger.load(taggers_ner[tagger_ner_name]) + elif re.search("^bert", tagger_ner_name): tagger_ner = load_bert_ner(taggers_ner[tagger_ner_name]) else: raise Exception(f"unknown tagger name: {tagger_ner_name}") - return tagger_ner, tagger_ner_name + return tagger_ner diff --git a/src/REL/server.py b/src/REL/server.py index c0c1ca8..b0a5dac 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -186,17 +186,14 @@ def generate_response(self, text, spans): p.add_argument("--ner-model", default="ner-fast") p.add_argument("--bind", "-b", metavar="ADDRESS", default="0.0.0.0") p.add_argument("--port", "-p", default=5555, type=int) - p.add_argument("--use_bert_large_cased", help = "use Bert large cased rather than Flair", action="store_true") - p.add_argument("--use_bert_base_cased", help = "use Bert base cased rather than Flair", action="store_true") - p.add_argument("--use_bert_large_uncased", help = "use Bert large uncased rather than Flair", action="store_true") - p.add_argument("--use_bert_base_uncased", help = "use Bert base uncased rather than Flair", action="store_true") - p.add_argument("--use_bert_multilingual", help = "use Bert multilingual rather than Flair", action="store_true") + 
p.add_argument("--tagger_ner_name", help = "mention detection tagger") p.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true") p.add_argument("--split_docs_value", help = "threshold number of tokens to split document") args = p.parse_args() - tagger_ner, tagger_ner_name = set_tagger_ner(args.use_bert_base_cased, args.use_bert_base_uncased, args.use_bert_large_cased, args.use_bert_large_uncased, args.use_bert_multilingual) + tagger_ner_name = args.tagger_ner_name + tagger_ner = set_tagger_ner(tagger_ner_name) split_docs_value = 0 if args.split_docs_value: From 083f7f56ed362fc18994590e4ddf9f90fbae1a85 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 30 Jan 2023 12:54:38 +0000 Subject: [PATCH 22/61] skipped tests requiring data --- tests/test_bert_md.py | 4 ++++ tests/test_ed_pipeline.py | 6 ++++++ tests/test_flair_md.py | 4 ++++ tests/test_instantiation.py | 17 +++++++++++++++++ tests/test_ngram.py | 6 ++++++ 5 files changed, 37 insertions(+) diff --git a/tests/test_bert_md.py b/tests/test_bert_md.py index eb5bc75..b15a306 100644 --- a/tests/test_bert_md.py +++ b/tests/test_bert_md.py @@ -4,6 +4,7 @@ from pathlib import Path import os +import pytest from transformers import AutoTokenizer, AutoModelForTokenClassification from transformers import pipeline @@ -12,6 +13,9 @@ from REL.ner.bert_wrapper import load_bert_ner +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_md(): tagger_ner_name = "bert_base_cased" tagger_ner = load_bert_ner("dslim/bert-base-NER") diff --git a/tests/test_ed_pipeline.py b/tests/test_ed_pipeline.py index 06bf11e..baa500f 100644 --- a/tests/test_ed_pipeline.py +++ b/tests/test_ed_pipeline.py @@ -3,12 +3,18 @@ from pathlib import Path +import os +import pytest + from REL.entity_disambiguation import EntityDisambiguation from REL.mention_detection import MentionDetection from REL.ner import Cmns from REL.utils import process_results +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_pipeline(): base_url = Path(__file__).parent wiki_subfolder = "wiki_test" diff --git a/tests/test_flair_md.py b/tests/test_flair_md.py index 606e03b..8242b5b 100644 --- a/tests/test_flair_md.py +++ b/tests/test_flair_md.py @@ -4,12 +4,16 @@ from pathlib import Path import os +import pytest from flair.models import SequenceTagger from REL.mention_detection import MentionDetection +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_md(): # return standard Flair tagger + mention detection object tagger_ner_name = "flair" diff --git a/tests/test_instantiation.py b/tests/test_instantiation.py index 52c3bf2..b431ac8 100644 --- a/tests/test_instantiation.py +++ b/tests/test_instantiation.py @@ -3,6 +3,8 @@ from pathlib import Path +import os +import pytest import torch from REL.entity_disambiguation import EntityDisambiguation @@ -11,6 +13,9 @@ from REL.ner import Cmns +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_entity_disambiguation_instantiation(): return EntityDisambiguation( Path(__file__).parent, @@ -22,20 +27,32 @@ def test_entity_disambiguation_instantiation(): ) +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." 
+) def test_cmns_instantiation(): return Cmns(Path(__file__).parent, "wiki_test") +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_mention_detection_instantiation(): return MentionDetection(Path(__file__).parent, "wiki_test") +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_prerank_instantiation(): # NOTE: this is basically just a blank constructor; if this fails, something is # seriously wrong return PreRank({}) +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_mulrel_ranker_instantiation(): # minimal config to make the constructor run config = { diff --git a/tests/test_ngram.py b/tests/test_ngram.py index 664e757..593cb84 100644 --- a/tests/test_ngram.py +++ b/tests/test_ngram.py @@ -3,6 +3,9 @@ from pathlib import Path +import os +import pytest + from REL.ner import Cmns, Span @@ -14,6 +17,9 @@ def compare_spans(a: Span, b: Span, fields=(0, 1, 2)): return True +@pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." +) def test_cmns(): model = Cmns(Path(__file__).parent, "wiki_test", n=5) predictions = model.predict("the brown fox jumped over the lazy dog", None) From 061edc155af0fc2d3da189ee881ebdf66689c223 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 30 Jan 2023 14:22:18 +0000 Subject: [PATCH 23/61] add defaults for arguments --- scripts/efficiency_test.py | 25 +++++++------------------ src/REL/server.py | 9 +++------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index 99950db..e43a37b 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -10,12 +10,12 @@ from REL.training_datasets import TrainingEvaluationDatasets parser = argparse.ArgumentParser() -parser.add_argument("--max_docs", help = "number of documents") +parser.add_argument("--max_docs", help = "number of documents", default="50") parser.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true") -parser.add_argument("--split_docs_value", help = "threshold number of tokens to split document") -parser.add_argument("--tagger_ner_name", help = "mention detection tagger") +parser.add_argument("--split_docs_value", help = "threshold number of tokens to split document", default="0") +parser.add_argument("--tagger_ner_name", help = "mention detection tagger", default="flair") parser.add_argument("--use_server", help = "use server", action="store_true") -parser.add_argument("--wiki_version", help = "Wiki version") +parser.add_argument("--wiki_version", help = "Wiki version", default="wiki_2019") args = parser.parse_args() np.random.seed(seed=42) @@ -23,20 +23,9 @@ base_url = os.path.abspath(os.path.dirname(__file__) + "/../data/") process_sentences = args.process_sentences -if args.split_docs_value: - split_docs_value = int(args.split_docs_value) -else: - split_docs_value = 0 - -if args.max_docs: - max_docs = int(args.max_docs) -else: - max_docs = 50 - -if args.wiki_version: - wiki_version = args.wiki_version -else: - wiki_version = "wiki_2019" +split_docs_value = int(args.split_docs_value) +max_docs = int(args.max_docs) +wiki_version = args.wiki_version datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()["aida_testB"] diff --git a/src/REL/server.py b/src/REL/server.py index 
b0a5dac..82eb78d 100644
--- a/src/REL/server.py
+++ b/src/REL/server.py
@@ -186,18 +186,15 @@ def generate_response(self, text, spans):
     p.add_argument("--ner-model", default="ner-fast")
     p.add_argument("--bind", "-b", metavar="ADDRESS", default="0.0.0.0")
     p.add_argument("--port", "-p", default=5555, type=int)
-    p.add_argument("--tagger_ner_name", help = "mention detection tagger")
+    p.add_argument("--tagger_ner_name", default="flair", help = "mention detection tagger")
     p.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true")
-    p.add_argument("--split_docs_value", help = "threshold number of tokens to split document")
+    p.add_argument("--split_docs_value", default="0", help = "threshold number of tokens to split document")

     args = p.parse_args()

     tagger_ner_name = args.tagger_ner_name
     tagger_ner = set_tagger_ner(tagger_ner_name)
-
-    split_docs_value = 0
-    if args.split_docs_value:
-        split_docs_value = int(args.split_docs_value)
+    split_docs_value = int(args.split_docs_value)

     process_sentences = args.process_sentences

From 369275fc08e1d9ac69c199ff754a72937342f32a Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Mon, 30 Jan 2023 16:33:10 +0000
Subject: [PATCH 24/61] replace next with continue

---
 scripts/efficiency_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py
index e43a37b..660050e 100644
--- a/scripts/efficiency_test.py
+++ b/scripts/efficiency_test.py
@@ -39,7 +39,7 @@
         if x["sentence"] not in sentences:
             sentences.append(x["sentence"])
     if len(sentences) == 0:
-        next
+        continue
     text = ". ".join([x for x in sentences])

     if len(docs) >= max_docs:

From 4e6703e5664639b052458d14064fcc1c849bdc3f Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Mon, 30 Jan 2023 17:38:44 +0100
Subject: [PATCH 25/61] replace with list comprehension

Co-authored-by: Stef Smeets
---
 scripts/efficiency_test.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py
index 660050e..01adf9b 100644
--- a/scripts/efficiency_test.py
+++ b/scripts/efficiency_test.py
@@ -67,13 +67,12 @@
             print(results.json())
             print("----------------------------")
         try:
-            results_list = []
-            for result in results.json():
-                results_list.append({ "mention": result[2], "prediction": result[3] }) # Flair + Bert
-            all_results[doc] = results_list
+            results_list = [{"mention": result[2], "prediction": result[3]} for result in results.json()]
         except json.decoder.JSONDecodeError:
             print("The analysis results are not in json format:", str(results))
-            all_results[doc] = []
+            results_list = []
+
+        all_results[doc] = results_list

 if len(all_results) > 0:
     evaluate(all_results)

From 7b15c1596b4d8b5a7f4f5bb3a1fd3d33877c2c40 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Mon, 30 Jan 2023 17:42:01 +0100
Subject: [PATCH 26/61] simplify code

Co-authored-by: Stef Smeets
---
 scripts/efficiency_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py
index 01adf9b..b49b749 100644
--- a/scripts/efficiency_test.py
+++ b/scripts/efficiency_test.py
@@ -74,9 +74,10 @@
         all_results[doc] = results_list

-if len(all_results) > 0:
+if all_results:
     evaluate(all_results)

+
 # --------------------- Now total --------------------------------
 # ------------- RUN SEPARATELY TO BALANCE LOAD--------------------
 if not use_server:

From 1ca5fbef123adb7a0df15a4810c211a1c0e98642 Mon Sep 17
00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 30 Jan 2023 17:45:48 +0100 Subject: [PATCH 27/61] values became keyword arguments Co-authored-by: Stef Smeets --- scripts/efficiency_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index b49b749..e5eb06d 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -101,7 +101,8 @@ mention_detection = MentionDetection(base_url, wiki_version) start = time() - mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) + mentions_dataset, n_mentions = mention_detection.find_mentions(docs, tagger_ner_name=tagger_ner_name, process_sentences=process_sentences, split_docs_value=split_docs_value, tagger_ner=tagger_ner) + print("MD took: {} seconds".format(round(time() - start, 2))) # 3. Load ED model. From 9e3dae1b293ca8ea2ab6c90e1031601c07f93a3d Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 30 Jan 2023 17:48:38 +0100 Subject: [PATCH 28/61] string formatting replaced rounding Co-authored-by: Stef Smeets --- scripts/efficiency_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/efficiency_test.py b/scripts/efficiency_test.py index e5eb06d..127f645 100644 --- a/scripts/efficiency_test.py +++ b/scripts/efficiency_test.py @@ -115,6 +115,7 @@ # 4. Entity disambiguation. start = time() predictions, timing = ed_model.predict(mentions_dataset) - print("ED took: {} seconds".format(round(time() - start, 2))) + print(f"ED took: {time() - start:.2f} seconds") + evaluate(predictions) From 25a5bf15cff70d9bb9012d91edf36e5cb43a93cb Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 14 Feb 2023 19:20:41 +0100 Subject: [PATCH 29/61] Update tests/test_evaluate_predictions.py Co-authored-by: Stef Smeets --- tests/test_evaluate_predictions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py index 3fbf1d8..d95623d 100644 --- a/tests/test_evaluate_predictions.py +++ b/tests/test_evaluate_predictions.py @@ -28,7 +28,7 @@ def test_md_wrong(): def test_combined(): gold_entities = [ [ "1", "1" ], [ "1", "1" ], [ "2", "2" ] ] predicted_entities = [ [ "0", "0" ], [ "0", "1" ], [ "1", "0" ], [ "1", "1" ] ] - correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) - precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) - assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [100/2, 100*2/3, 100*4/7, 100/4, 100/3, 100*2/7], "should be various scores" + counts = compare_and_count_entities(gold_entities, predicted_entities) + scores = print_scores(*counts) + assert scores == [100/2, 100*2/3, 100*4/7, 100/4, 100/3, 100*2/7], "should be various scores" From 508483a1084a88ad6fd9a67623045c824cd107eb Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 14 Feb 2023 19:21:14 +0100 Subject: [PATCH 30/61] Update tests/test_evaluate_predictions.py Co-authored-by: Stef Smeets --- tests/test_evaluate_predictions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py index d95623d..36e9172 100644 --- a/tests/test_evaluate_predictions.py +++ b/tests/test_evaluate_predictions.py @@ -12,9 +12,9 @@ def test_perfect(): def test_el_wrong(): gold_entities 
= [ [ "1", "1" ] ] predicted_entities = [ [ "1", "0" ] ] - correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) - precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) - assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [100, 100, 100, 0, 0, 0], "should be perfect MD and failed EL" + counts = compare_and_count_entities(gold_entities, predicted_entities) + scores = print_scores(*counts) + assert scores == [100, 100, 100, 0, 0, 0], "should be perfect MD and failed EL" def test_md_wrong(): From cd86c356f4cfdc94fa7a668047c5195073dc2183 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 14 Feb 2023 19:21:37 +0100 Subject: [PATCH 31/61] Update tests/test_evaluate_predictions.py Co-authored-by: Stef Smeets --- tests/test_evaluate_predictions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py index 36e9172..4d49995 100644 --- a/tests/test_evaluate_predictions.py +++ b/tests/test_evaluate_predictions.py @@ -20,9 +20,9 @@ def test_el_wrong(): def test_md_wrong(): gold_entities = [ [ "1", "1" ] ] predicted_entities = [ [ "0", "1" ] ] - correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) - precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) - assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [0, 0, 0, 0, 0, 0], "should be failed MD and failed EL" + counts = compare_and_count_entities(gold_entities, predicted_entities) + scores = print_scores(*counts) + assert scores == [0, 0, 0, 0, 0, 0], "should be failed MD and failed EL" def test_combined(): From fa188e8a6e6b587d935de4402a734f5869bd86a6 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 14 Feb 2023 19:21:56 +0100 Subject: [PATCH 32/61] Update tests/test_evaluate_predictions.py Co-authored-by: Stef Smeets --- tests/test_evaluate_predictions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py index 4d49995..adc5084 100644 --- a/tests/test_evaluate_predictions.py +++ b/tests/test_evaluate_predictions.py @@ -4,9 +4,9 @@ def test_perfect(): gold_entities = [ [ "1", "1" ] ] predicted_entities = [ [ "1", "1" ] ] - correct, wrong_md, wrong_el, missed = compare_and_count_entities(gold_entities, predicted_entities) - precision_md, recall_md, f1_md, precision_el, recall_el, f1_el = print_scores(correct, wrong_md, wrong_el, missed) - assert [precision_md, recall_md, f1_md, precision_el, recall_el, f1_el] == [100, 100, 100, 100, 100, 100], "should be perfect MD and perfect EL" + counts = compare_and_count_entities(gold_entities, predicted_entities) + scores = print_scores(*counts) + assert scores == [100, 100, 100, 100, 100, 100], "should be perfect MD and perfect EL" def test_el_wrong(): From cbdc79139d21fe52e010fd87a7b36fabff75cfc2 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 14 Feb 2023 18:27:20 +0000 Subject: [PATCH 33/61] fixed data format --- tests/test_evaluate_predictions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py index adc5084..0000624 100644 --- a/tests/test_evaluate_predictions.py +++ b/tests/test_evaluate_predictions.py @@ -6,7 +6,7 @@ def test_perfect(): 
predicted_entities = [ [ "1", "1" ] ] counts = compare_and_count_entities(gold_entities, predicted_entities) scores = print_scores(*counts) - assert scores == [100, 100, 100, 100, 100, 100], "should be perfect MD and perfect EL" + assert list(scores) == [100, 100, 100, 100, 100, 100], "should be perfect MD and perfect EL" def test_el_wrong(): @@ -14,7 +14,7 @@ def test_el_wrong(): predicted_entities = [ [ "1", "0" ] ] counts = compare_and_count_entities(gold_entities, predicted_entities) scores = print_scores(*counts) - assert scores == [100, 100, 100, 0, 0, 0], "should be perfect MD and failed EL" + assert list(scores) == [100, 100, 100, 0, 0, 0], "should be perfect MD and failed EL" def test_md_wrong(): @@ -22,7 +22,7 @@ def test_md_wrong(): predicted_entities = [ [ "0", "1" ] ] counts = compare_and_count_entities(gold_entities, predicted_entities) scores = print_scores(*counts) - assert scores == [0, 0, 0, 0, 0, 0], "should be failed MD and failed EL" + assert list(scores) == [0, 0, 0, 0, 0, 0], "should be failed MD and failed EL" def test_combined(): @@ -30,5 +30,5 @@ def test_combined(): predicted_entities = [ [ "0", "0" ], [ "0", "1" ], [ "1", "0" ], [ "1", "1" ] ] counts = compare_and_count_entities(gold_entities, predicted_entities) scores = print_scores(*counts) - assert scores == [100/2, 100*2/3, 100*4/7, 100/4, 100/3, 100*2/7], "should be various scores" + assert list(scores) == [100/2, 100*2/3, 100*4/7, 100/4, 100/3, 100*2/7], "should be various scores" From d31135ed47669398100b089625988d9fd2be7123 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 13 Feb 2024 17:28:58 +0100 Subject: [PATCH 34/61] make tests work --- {scripts => src/scripts}/WikiExtractor.py | 0 {scripts => src/scripts}/__init__.py | 0 {scripts => src/scripts}/code_tutorials/batch_EL.py | 0 .../scripts}/code_tutorials/example_custom_MD.py | 0 .../scripts}/code_tutorials/generate_p_e_m.py | 0 .../scripts}/code_tutorials/generate_train_val.py | 0 .../scripts}/code_tutorials/predict_EL.py | 0 .../scripts}/code_tutorials/run_server.py | 0 .../scripts}/code_tutorials/run_server_temp.py | 0 {scripts => src/scripts}/code_tutorials/test_API.py | 0 {scripts => src/scripts}/code_tutorials/train_LR.py | 0 .../scripts}/code_tutorials/train_eval_ED.py | 0 .../scripts}/comparison_BLINK/run_server.py | 0 {scripts => src/scripts}/comparison_BLINK/test.py | 0 {scripts => src/scripts}/download_data.sh | 0 {scripts => src/scripts}/efficiency_results.py | 0 {scripts => src/scripts}/efficiency_test.py | 0 {scripts => src/scripts}/evaluate_predictions.py | 0 .../scripts}/gerbil_middleware/.gitignore | 0 .../scripts}/gerbil_middleware/Dockerfile | 0 {scripts => src/scripts}/gerbil_middleware/LICENSE | 0 {scripts => src/scripts}/gerbil_middleware/Makefile | 0 .../scripts}/gerbil_middleware/README.md | 0 .../scripts}/gerbil_middleware/curlExample.sh | 0 .../scripts}/gerbil_middleware/docker-compose.yml | 0 .../scripts}/gerbil_middleware/example.ttl | 0 {scripts => src/scripts}/gerbil_middleware/pom.xml | 0 .../gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar | Bin ...rbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5 | 0 ...bil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1 | 0 .../gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar | Bin ...rbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5 | 0 ...bil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1 | 0 .../gerbil.nif.transfer-1.1.0-SNAPSHOT.jar | Bin .../gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5 | 0 .../gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1 | 0 
.../gerbil.nif.transfer-1.1.0-SNAPSHOT.pom | 0 .../gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5 | 0 .../gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1 | 0 .../1.1.0-SNAPSHOT/maven-metadata-local.xml | 0 .../1.1.0-SNAPSHOT/maven-metadata-local.xml.md5 | 0 .../1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1 | 0 .../gerbil.nif.transfer/maven-metadata-local.xml | 0 .../maven-metadata-local.xml.md5 | 0 .../maven-metadata-local.xml.sha1 | 0 .../2.2.1/org.restlet.ext.servlet-2.2.1.jar | Bin .../2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5 | 0 .../2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1 | 0 .../2.2.1/org.restlet.ext.servlet-2.2.1.pom | 0 .../2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5 | 0 .../2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1 | 0 .../maven-metadata-local.xml | 0 .../maven-metadata-local.xml.md5 | 0 .../maven-metadata-local.xml.sha1 | 0 .../restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar | Bin .../org.restlet/2.2.1/org.restlet-2.2.1.jar.md5 | 0 .../org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1 | 0 .../restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom | 0 .../org.restlet/2.2.1/org.restlet-2.2.1.pom.md5 | 0 .../org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1 | 0 .../restlet/org.restlet/maven-metadata-local.xml | 0 .../org.restlet/maven-metadata-local.xml.md5 | 0 .../org.restlet/maven-metadata-local.xml.sha1 | 0 .../java/org/aksw/gerbil/ws4test/EDResource.java | 0 .../gerbil/ws4test/LocalIntermediateWebserver.java | 0 .../java/org/aksw/gerbil/ws4test/MyResource.java | 0 .../org/aksw/gerbil/ws4test/SpotlightClient.java | 0 .../org/aksw/gerbil/ws4test/SpotlightResource.java | 0 .../org/aksw/gerbil/ws4test/TestApplication.java | 0 .../main/java/org/aksw/gerbil/ws4test/data_format | 0 .../src/main/resources/log4j.properties | 0 .../src/main/webapp/WEB-INF/web.xml | 0 {scripts => src/scripts}/truecase/README.md | 0 {scripts => src/scripts}/truecase/relq.py | 0 {scripts => src/scripts}/truecase/truecase-m.py | 0 {scripts => src/scripts}/update_db_pem.py | 0 {scripts => src/scripts}/w2v/preprocess.sh | 0 {scripts => src/scripts}/w2v/train.sh | 0 78 files changed, 0 insertions(+), 0 deletions(-) rename {scripts => src/scripts}/WikiExtractor.py (100%) rename {scripts => src/scripts}/__init__.py (100%) rename {scripts => src/scripts}/code_tutorials/batch_EL.py (100%) rename {scripts => src/scripts}/code_tutorials/example_custom_MD.py (100%) rename {scripts => src/scripts}/code_tutorials/generate_p_e_m.py (100%) rename {scripts => src/scripts}/code_tutorials/generate_train_val.py (100%) rename {scripts => src/scripts}/code_tutorials/predict_EL.py (100%) rename {scripts => src/scripts}/code_tutorials/run_server.py (100%) rename {scripts => src/scripts}/code_tutorials/run_server_temp.py (100%) rename {scripts => src/scripts}/code_tutorials/test_API.py (100%) rename {scripts => src/scripts}/code_tutorials/train_LR.py (100%) rename {scripts => src/scripts}/code_tutorials/train_eval_ED.py (100%) rename {scripts => src/scripts}/comparison_BLINK/run_server.py (100%) rename {scripts => src/scripts}/comparison_BLINK/test.py (100%) rename {scripts => src/scripts}/download_data.sh (100%) rename {scripts => src/scripts}/efficiency_results.py (100%) rename {scripts => src/scripts}/efficiency_test.py (100%) rename {scripts => src/scripts}/evaluate_predictions.py (100%) rename {scripts => src/scripts}/gerbil_middleware/.gitignore (100%) rename {scripts => src/scripts}/gerbil_middleware/Dockerfile (100%) rename {scripts => src/scripts}/gerbil_middleware/LICENSE (100%) rename {scripts => src/scripts}/gerbil_middleware/Makefile 
(100%) rename {scripts => src/scripts}/gerbil_middleware/README.md (100%) rename {scripts => src/scripts}/gerbil_middleware/curlExample.sh (100%) rename {scripts => src/scripts}/gerbil_middleware/docker-compose.yml (100%) rename {scripts => src/scripts}/gerbil_middleware/example.ttl (100%) rename {scripts => src/scripts}/gerbil_middleware/pom.xml (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1 (100%) rename {scripts => 
src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.md5 (100%) rename {scripts => src/scripts}/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.sha1 (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/EDResource.java (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/MyResource.java (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/data_format (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/resources/log4j.properties (100%) rename {scripts => src/scripts}/gerbil_middleware/src/main/webapp/WEB-INF/web.xml (100%) rename {scripts => src/scripts}/truecase/README.md (100%) rename {scripts => src/scripts}/truecase/relq.py (100%) rename {scripts => src/scripts}/truecase/truecase-m.py (100%) rename {scripts => src/scripts}/update_db_pem.py (100%) rename {scripts => src/scripts}/w2v/preprocess.sh (100%) rename {scripts => src/scripts}/w2v/train.sh (100%) diff --git a/scripts/WikiExtractor.py b/src/scripts/WikiExtractor.py similarity index 100% rename from scripts/WikiExtractor.py rename to src/scripts/WikiExtractor.py diff --git a/scripts/__init__.py b/src/scripts/__init__.py similarity index 100% rename from scripts/__init__.py rename to 
src/scripts/__init__.py diff --git a/scripts/code_tutorials/batch_EL.py b/src/scripts/code_tutorials/batch_EL.py similarity index 100% rename from scripts/code_tutorials/batch_EL.py rename to src/scripts/code_tutorials/batch_EL.py diff --git a/scripts/code_tutorials/example_custom_MD.py b/src/scripts/code_tutorials/example_custom_MD.py similarity index 100% rename from scripts/code_tutorials/example_custom_MD.py rename to src/scripts/code_tutorials/example_custom_MD.py diff --git a/scripts/code_tutorials/generate_p_e_m.py b/src/scripts/code_tutorials/generate_p_e_m.py similarity index 100% rename from scripts/code_tutorials/generate_p_e_m.py rename to src/scripts/code_tutorials/generate_p_e_m.py diff --git a/scripts/code_tutorials/generate_train_val.py b/src/scripts/code_tutorials/generate_train_val.py similarity index 100% rename from scripts/code_tutorials/generate_train_val.py rename to src/scripts/code_tutorials/generate_train_val.py diff --git a/scripts/code_tutorials/predict_EL.py b/src/scripts/code_tutorials/predict_EL.py similarity index 100% rename from scripts/code_tutorials/predict_EL.py rename to src/scripts/code_tutorials/predict_EL.py diff --git a/scripts/code_tutorials/run_server.py b/src/scripts/code_tutorials/run_server.py similarity index 100% rename from scripts/code_tutorials/run_server.py rename to src/scripts/code_tutorials/run_server.py diff --git a/scripts/code_tutorials/run_server_temp.py b/src/scripts/code_tutorials/run_server_temp.py similarity index 100% rename from scripts/code_tutorials/run_server_temp.py rename to src/scripts/code_tutorials/run_server_temp.py diff --git a/scripts/code_tutorials/test_API.py b/src/scripts/code_tutorials/test_API.py similarity index 100% rename from scripts/code_tutorials/test_API.py rename to src/scripts/code_tutorials/test_API.py diff --git a/scripts/code_tutorials/train_LR.py b/src/scripts/code_tutorials/train_LR.py similarity index 100% rename from scripts/code_tutorials/train_LR.py rename to src/scripts/code_tutorials/train_LR.py diff --git a/scripts/code_tutorials/train_eval_ED.py b/src/scripts/code_tutorials/train_eval_ED.py similarity index 100% rename from scripts/code_tutorials/train_eval_ED.py rename to src/scripts/code_tutorials/train_eval_ED.py diff --git a/scripts/comparison_BLINK/run_server.py b/src/scripts/comparison_BLINK/run_server.py similarity index 100% rename from scripts/comparison_BLINK/run_server.py rename to src/scripts/comparison_BLINK/run_server.py diff --git a/scripts/comparison_BLINK/test.py b/src/scripts/comparison_BLINK/test.py similarity index 100% rename from scripts/comparison_BLINK/test.py rename to src/scripts/comparison_BLINK/test.py diff --git a/scripts/download_data.sh b/src/scripts/download_data.sh similarity index 100% rename from scripts/download_data.sh rename to src/scripts/download_data.sh diff --git a/scripts/efficiency_results.py b/src/scripts/efficiency_results.py similarity index 100% rename from scripts/efficiency_results.py rename to src/scripts/efficiency_results.py diff --git a/scripts/efficiency_test.py b/src/scripts/efficiency_test.py similarity index 100% rename from scripts/efficiency_test.py rename to src/scripts/efficiency_test.py diff --git a/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py similarity index 100% rename from scripts/evaluate_predictions.py rename to src/scripts/evaluate_predictions.py diff --git a/scripts/gerbil_middleware/.gitignore b/src/scripts/gerbil_middleware/.gitignore similarity index 100% rename from 
scripts/gerbil_middleware/.gitignore rename to src/scripts/gerbil_middleware/.gitignore diff --git a/scripts/gerbil_middleware/Dockerfile b/src/scripts/gerbil_middleware/Dockerfile similarity index 100% rename from scripts/gerbil_middleware/Dockerfile rename to src/scripts/gerbil_middleware/Dockerfile diff --git a/scripts/gerbil_middleware/LICENSE b/src/scripts/gerbil_middleware/LICENSE similarity index 100% rename from scripts/gerbil_middleware/LICENSE rename to src/scripts/gerbil_middleware/LICENSE diff --git a/scripts/gerbil_middleware/Makefile b/src/scripts/gerbil_middleware/Makefile similarity index 100% rename from scripts/gerbil_middleware/Makefile rename to src/scripts/gerbil_middleware/Makefile diff --git a/scripts/gerbil_middleware/README.md b/src/scripts/gerbil_middleware/README.md similarity index 100% rename from scripts/gerbil_middleware/README.md rename to src/scripts/gerbil_middleware/README.md diff --git a/scripts/gerbil_middleware/curlExample.sh b/src/scripts/gerbil_middleware/curlExample.sh similarity index 100% rename from scripts/gerbil_middleware/curlExample.sh rename to src/scripts/gerbil_middleware/curlExample.sh diff --git a/scripts/gerbil_middleware/docker-compose.yml b/src/scripts/gerbil_middleware/docker-compose.yml similarity index 100% rename from scripts/gerbil_middleware/docker-compose.yml rename to src/scripts/gerbil_middleware/docker-compose.yml diff --git a/scripts/gerbil_middleware/example.ttl b/src/scripts/gerbil_middleware/example.ttl similarity index 100% rename from scripts/gerbil_middleware/example.ttl rename to src/scripts/gerbil_middleware/example.ttl diff --git a/scripts/gerbil_middleware/pom.xml b/src/scripts/gerbil_middleware/pom.xml similarity index 100% rename from scripts/gerbil_middleware/pom.xml rename to src/scripts/gerbil_middleware/pom.xml diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1 rename to 
src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1 rename to 
src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.md5 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.md5 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.md5 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml similarity index 100% rename from 
scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.md5 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.md5 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.md5 diff --git a/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.sha1 b/src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.sha1 rename to src/scripts/gerbil_middleware/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.sha1 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5 similarity index 100% rename from 
scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.md5 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.md5 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.md5 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.sha1 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.sha1 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.sha1 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.md5 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.md5 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.md5 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1 diff --git 
a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.md5 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.md5 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.md5 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.md5 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.md5 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.md5 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.md5 diff --git a/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.sha1 b/src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.sha1 similarity index 100% rename from scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.sha1 rename to src/scripts/gerbil_middleware/repository/org/restlet/org.restlet/maven-metadata-local.xml.sha1 diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/EDResource.java b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/EDResource.java similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/EDResource.java rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/EDResource.java diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/MyResource.java 
b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/MyResource.java similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/MyResource.java rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/MyResource.java diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java diff --git a/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/data_format b/src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/data_format similarity index 100% rename from scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/data_format rename to src/scripts/gerbil_middleware/src/main/java/org/aksw/gerbil/ws4test/data_format diff --git a/scripts/gerbil_middleware/src/main/resources/log4j.properties b/src/scripts/gerbil_middleware/src/main/resources/log4j.properties similarity index 100% rename from scripts/gerbil_middleware/src/main/resources/log4j.properties rename to src/scripts/gerbil_middleware/src/main/resources/log4j.properties diff --git a/scripts/gerbil_middleware/src/main/webapp/WEB-INF/web.xml b/src/scripts/gerbil_middleware/src/main/webapp/WEB-INF/web.xml similarity index 100% rename from scripts/gerbil_middleware/src/main/webapp/WEB-INF/web.xml rename to src/scripts/gerbil_middleware/src/main/webapp/WEB-INF/web.xml diff --git a/scripts/truecase/README.md b/src/scripts/truecase/README.md similarity index 100% rename from scripts/truecase/README.md rename to src/scripts/truecase/README.md diff --git a/scripts/truecase/relq.py b/src/scripts/truecase/relq.py similarity index 100% rename from scripts/truecase/relq.py rename to src/scripts/truecase/relq.py diff --git a/scripts/truecase/truecase-m.py b/src/scripts/truecase/truecase-m.py similarity index 100% rename from scripts/truecase/truecase-m.py rename to src/scripts/truecase/truecase-m.py diff --git a/scripts/update_db_pem.py b/src/scripts/update_db_pem.py similarity index 100% rename from scripts/update_db_pem.py rename to src/scripts/update_db_pem.py diff --git a/scripts/w2v/preprocess.sh b/src/scripts/w2v/preprocess.sh similarity index 100% rename from scripts/w2v/preprocess.sh rename to src/scripts/w2v/preprocess.sh diff --git a/scripts/w2v/train.sh b/src/scripts/w2v/train.sh similarity index 100% rename from scripts/w2v/train.sh rename to src/scripts/w2v/train.sh From 
10a3d87494c56dca654a02af7eaf1b476979fccd Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 13 Feb 2024 17:37:45 +0100
Subject: [PATCH 35/61] make tests work

---
 tests/test_instantiation.py | 12 ++++++------
 tests/test_ngram.py         |  9 ++++++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/tests/test_instantiation.py b/tests/test_instantiation.py
index b431ac8..16a34a2 100644
--- a/tests/test_instantiation.py
+++ b/tests/test_instantiation.py
@@ -17,28 +17,28 @@
     os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions."
 )
 def test_entity_disambiguation_instantiation():
-    return EntityDisambiguation(
+    assert True == bool(EntityDisambiguation(
         Path(__file__).parent,
         "wiki_test",
         {
             "mode": "eval",
             "model_path": Path(__file__).parent / "wiki_test" / "generated" / "model",
         },
-    )
+    ))


 @pytest.mark.skipif(
     os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions."
 )
 def test_cmns_instantiation():
-    return Cmns(Path(__file__).parent, "wiki_test")
+    assert True == bool(Cmns(Path(__file__).parent, "wiki_test"))


 @pytest.mark.skipif(
     os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions."
 )
 def test_mention_detection_instantiation():
-    return MentionDetection(Path(__file__).parent, "wiki_test")
+    assert True == bool(MentionDetection(Path(__file__).parent, "wiki_test"))


 @pytest.mark.skipif(
@@ -47,7 +47,7 @@
 def test_prerank_instantiation():
     # NOTE: this is basically just a blank constructor; if this fails, something is
     # seriously wrong
-    return PreRank({})
+    assert True == bool(PreRank({}))


 @pytest.mark.skipif(
@@ -63,4 +63,4 @@
         "use_local": True,
         "use_pad_ent": True,
     }
-    return MulRelRanker(config, torch.device("cpu"))
+    assert True == bool(MulRelRanker(config, torch.device("cpu")))
diff --git a/tests/test_ngram.py b/tests/test_ngram.py
index 593cb84..f849944 100644
--- a/tests/test_ngram.py
+++ b/tests/test_ngram.py
@@ -10,9 +10,12 @@
 def compare_spans(a: Span, b: Span, fields=(0, 1, 2)):
+    if len(a) != len(b):
+        return False
     for f in fields:
-        if a[f] != b[f]:
-            return False
+        for index in range(0, len(a)):
+            if a[index][f] != b[index][f]:
+                return (False, a[index][f], b[index][f])
     else:
         return True
@@ -34,4 +37,4 @@ def test_cmns():
         Span("dog", 35, 38, None, None),
     ]

-    return compare_spans(predictions, labels)
+    assert compare_spans(predictions, labels) == True
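Patch 35 turns tests that returned objects into tests that assert on them: pytest ignores a test function's return value, so a returning test can only fail if the constructor raises, and recent pytest versions flag the pattern with PytestReturnNotNoneWarning. A minimal, self-contained sketch of the idiom; the class here is a stand-in, not REL code:

class SomeComponent:  # stand-in for EntityDisambiguation, Cmns, PreRank, ...
    def __init__(self, config):
        self.config = config

def test_instantiation_old_style():
    # Constructor errors would still fail this test, but the returned object is
    # never checked, and newer pytest emits PytestReturnNotNoneWarning.
    return SomeComponent({})

def test_instantiation_new_style():
    # What the patch's `assert True == bool(...)` expresses, written plainly.
    assert SomeComponent({})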
From 93899ed6254eb95b293eaca1b6da725e2b979788 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 13 Feb 2024 17:42:02 +0100
Subject: [PATCH 36/61] make tests work

---
 src/REL/server.py                   | 2 +-
 src/scripts/efficiency_test.py      | 5 +++--
 src/scripts/evaluate_predictions.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/REL/server.py b/src/REL/server.py
index 82eb78d..14d24db 100644
--- a/src/REL/server.py
+++ b/src/REL/server.py
@@ -180,7 +180,7 @@ def generate_response(self, text, spans):
     from REL.ner.bert_wrapper import load_bert_ner

     p = argparse.ArgumentParser()
-    p.add_argument("--base_url", default=os.path.abspath(os.path.dirname(__file__) + "/../../data/"))
+    p.add_argument("--base_url", default=os.path.abspath(os.path.dirname(__file__) + "/../data/"))
     p.add_argument("--wiki_version", default="wiki_2019")
     p.add_argument("--ed-model", default="ed-wiki-2019")
    p.add_argument("--ner-model", default="ner-fast")
diff --git a/src/scripts/efficiency_test.py b/src/scripts/efficiency_test.py
index 127f645..8c73b92 100644
--- a/src/scripts/efficiency_test.py
+++ b/src/scripts/efficiency_test.py
@@ -49,6 +49,7 @@
     if len(text.split()) > 200:
         docs[doc] = [text, []]

+    results_list = []
     # Demo script that can be used to query the API.
     if use_server:
         print(f"max_docs={max_docs} use_server={use_server}")
@@ -70,9 +71,9 @@
             results_list = [{"mention": result[2], "prediction": result[3]} for result in results.json()]
         except json.decoder.JSONDecodeError:
             print("The analysis results are not in json format:", str(results))
-            results_list = []

-    all_results[doc] = results_list
+    if results_list:
+        all_results[doc] = results_list

 if all_results:
     evaluate(all_results)
diff --git a/src/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py
index 90727b5..7b25174 100644
--- a/src/scripts/evaluate_predictions.py
+++ b/src/scripts/evaluate_predictions.py
@@ -5,7 +5,7 @@

 def get_gold_data(doc):
-    GOLD_DATA_FILE = "./data/generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv"
+    GOLD_DATA_FILE = "./src/data/generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv"
     entities = []

     in_file = open(GOLD_DATA_FILE, "r")
From 72b31e0ae11dfca91ac81816f270d6f5f86e9c86 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 13 Feb 2024 18:08:21 +0100
Subject: [PATCH 37/61] make tests work

---
 tests/test_bert_md.py  | 2 +-
 tests/test_flair_md.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_bert_md.py b/tests/test_bert_md.py
index b15a306..e8c3ead 100644
--- a/tests/test_bert_md.py
+++ b/tests/test_bert_md.py
@@ -21,7 +21,7 @@ def test_md():
     tagger_ner = load_bert_ner("dslim/bert-base-NER")
     process_sentences = False
     split_docs_value = 0
-    base_url = os.path.dirname(__file__) + "/../data/"
+    base_url = os.path.dirname(__file__) + "/../src/data/"
     wiki_version = "wiki_2019"

     md = MentionDetection(base_url, wiki_version)
diff --git a/tests/test_flair_md.py b/tests/test_flair_md.py
index 8242b5b..5c0d445 100644
--- a/tests/test_flair_md.py
+++ b/tests/test_flair_md.py
@@ -20,7 +20,7 @@ def test_md():
     tagger_ner = SequenceTagger.load("ner-fast")
     process_sentences = True
     split_docs_value = 0
-    base_url = os.path.dirname(__file__) + "/../data/"
+    base_url = os.path.dirname(__file__) + "/../src/data/"
     wiki_version = "wiki_2019"

     md = MentionDetection(base_url, wiki_version)
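Patch 36 initialises results_list before the server round trip and only records documents that produced usable output. A condensed sketch of that query-and-guard pattern; the endpoint URL and payload shape are assumptions for illustration, not copied from the script:

import json
import requests

def query_rel_server(text, url="http://localhost:1235"):
    # Assumed endpoint; the real script builds a `myjson` payload with
    # "text" and "spans" keys and posts it to the running REL API.
    results_list = []
    response = requests.post(url, json={"text": text, "spans": []})
    try:
        results_list = [{"mention": r[2], "prediction": r[3]} for r in response.json()]
    except json.decoder.JSONDecodeError:
        print("The analysis results are not in json format:", str(response))
    return results_list  # an empty list tells the caller to skip this document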
From 44dc91d531b1609aa7d00822217eccaea6954e24 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 13 Feb 2024 18:12:42 +0100
Subject: [PATCH 38/61] removed redundant function

---
 src/REL/mention_detection.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py
index 6de019e..b6e42ee 100644
--- a/src/REL/mention_detection.py
+++ b/src/REL/mention_detection.py
@@ -160,19 +160,6 @@ def split_sentence_in_bert_tokens(self, sentence, tagger_ner):

         return tokens, token_spans

-
-    def combine_tokens_to_text(self, token_list):
-        text = ""
-        for token in token_list:
-            if re.search("^##", token):
-                text += re.sub("^##", "", token)
-            elif text == "":
-                text = token
-            else:
-                text += " " + token
-        return text
-
-
     def split_text_in_parts(self, text, split_docs_value, tagger_ner, is_flair):
         """ Splits text in parts of at most split_docs_value tokens. Texts are split at sentence
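The deleted combine_tokens_to_text glued WordPiece tokens back into surface text: BERT tokenizers mark word-internal pieces with a leading "##". The helper became redundant because mentions are now sliced straight out of the original sentence by character offset. A small self-contained sketch of what it did:

# WordPiece splits rare words into pieces, e.g. ["Rad", "##boud", "University"];
# the "##" marker means "attach to the previous piece without a space".
def join_wordpieces(tokens):
    text = ""
    for token in tokens:
        if token.startswith("##"):
            text += token[2:]      # glue continuation piece onto the word
        elif text:
            text += " " + token
        else:
            text = token
    return text

assert join_wordpieces(["Rad", "##boud", "University"]) == "Radboud University"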
From ea80f2fa8b6c5da2cc1e0346ca152dfed6f9715a Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 20 Feb 2024 18:08:33 +0100
Subject: [PATCH 39/61] use_server on same level

---
 src/scripts/efficiency_test.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/scripts/efficiency_test.py b/src/scripts/efficiency_test.py
index 8c73b92..639c92e 100644
--- a/src/scripts/efficiency_test.py
+++ b/src/scripts/efficiency_test.py
@@ -38,20 +38,18 @@
     for x in datasets[doc]:
         if x["sentence"] not in sentences:
            sentences.append(x["sentence"])
-    if len(sentences) == 0:
-        continue
-    text = ". ".join([x for x in sentences])
+    if len(text.split()) > 200:
+        docs[doc] = [text, []]
     if len(docs) >= max_docs:
-        print(f"length docs is {len(docs)}.")
-        print("====================")
         break
-    if len(text.split()) > 200:
-        docs[doc] = [text, []]
-    results_list = []
-    # Demo script that can be used to query the API.
-    if use_server:
+if use_server:
+    for i, doc in enumerate(datasets):
+        text = docs[doc][0]
+        if len(text.split()) > 200:
+            results_list = []
+            # Demo script that can be used to query the API.
             print(f"max_docs={max_docs} use_server={use_server}")
             myjson = {
                 "text": text,
@@ -70,9 +71,9 @@
                 results_list = [{"mention": result[2], "prediction": result[3]} for result in results.json()]
             except json.decoder.JSONDecodeError:
                 print("The analysis results are not in json format:", str(results))

-    if results_list:
-        all_results[doc] = results_list
+            if results_list:
+                all_results[doc] = results_list
+
+        if len(docs) >= max_docs:
+            print(f"length docs is {len(docs)}.")
+            print("====================")
+            break

-if all_results:
-    evaluate(all_results)
+    if all_results:
+        evaluate(all_results)
From 83ff7e6fe3f9be6104455fe093e5379414235ffe Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 27 Feb 2024 14:08:27 +0100
Subject: [PATCH 40/61] fixed unreadable code

---
 src/REL/db/base.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/REL/db/base.py b/src/REL/db/base.py
index 8b66db5..d5d1530 100644
--- a/src/REL/db/base.py
+++ b/src/REL/db/base.py
@@ -184,10 +184,15 @@ def lookup_wik(self, w, table_name, column):
             "select {} from {} where word = :word".format(column, table_name),
             {"word": w},
         ).fetchone()
-        try:
-            res = ( e if e is None else json.loads(e[0].decode()) if column == "p_e_m" else e[0] )
-        except Exception:
-            res = ( e if e is None else json.loads("".join(chr(int(x, 2)) for x in e[0].split())) if column == "p_e_m" else e[0] )
+        if not e:
+            res = None
+        elif column == "p_e_m":
+            try:
+                res = json.loads(e[0].decode())
+            except AttributeError:
+                res = json.loads("".join(chr(int(x, 2)) for x in e[0].split()))
+        else:
+            res = e[0]

         return res
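The rewritten lookup_wik makes the two storage formats of the p_e_m column explicit: usually JSON-encoded bytes, but a legacy form stores each character as a space-separated binary number (a plain str has no .decode(), so the AttributeError routes it to the fallback). A round-trip sketch of both decodings, with a made-up value:

import json

# Normal path: the column holds JSON as bytes.
stored_bytes = b'{"Amsterdam": 0.9}'
assert json.loads(stored_bytes.decode()) == {"Amsterdam": 0.9}

# Legacy path: every character encoded as a binary number, space-separated.
legacy = " ".join(format(ord(c), "b") for c in '{"Amsterdam": 0.9}')
decoded = "".join(chr(int(x, 2)) for x in legacy.split())
assert json.loads(decoded) == {"Amsterdam": 0.9}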
From 94cc303bea54866e4a48f8c6a79ddf3594389671 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 27 Feb 2024 15:06:11 +0100
Subject: [PATCH 41/61] base_url for defining path

---
 src/scripts/efficiency_test.py      | 4 ++--
 src/scripts/evaluate_predictions.py | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/scripts/efficiency_test.py b/src/scripts/efficiency_test.py
index 639c92e..c21a64e 100644
--- a/src/scripts/efficiency_test.py
+++ b/src/scripts/efficiency_test.py
@@ -79,7 +79,7 @@
             break

     if all_results:
-        evaluate(all_results)
+        evaluate(all_results, base_url)


 # --------------------- Now total --------------------------------
@@ -122,4 +122,4 @@

     print(f"ED took: {time() - start:.2f} seconds")

-    evaluate(predictions)
+    evaluate(predictions, base_url)
diff --git a/src/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py
index 7b25174..8588f7e 100644
--- a/src/scripts/evaluate_predictions.py
+++ b/src/scripts/evaluate_predictions.py
@@ -1,11 +1,12 @@
+import os
 import re


 UNUSED = -1


-def get_gold_data(doc):
-    GOLD_DATA_FILE = "./src/data/generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv"
+def get_gold_data(doc, base_url):
+    GOLD_DATA_FILE = os.path.join(base_url, "generic/test_datasets/AIDA/AIDA-YAGO2-dataset.tsv")
     entities = []

     in_file = open(GOLD_DATA_FILE, "r")
@@ -118,13 +119,13 @@ def print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all):
     return precision_md, recall_md, f1_md, precision_el, recall_el, f1_el


-def evaluate(predictions):
+def evaluate(predictions, base_url):
     correct_all = 0
     wrong_md_all = 0
     wrong_el_all = 0
     missed_all = 0
     for doc in predictions:
-        gold_entities = get_gold_data(doc)
+        gold_entities = get_gold_data(doc, base_url)
         predicted_entities = []
         for mention in predictions[doc]:
             predicted_entities.append([mention["mention"], mention["prediction"]])
From cdcf86c74634d9057b0243a1ffaadbb158b2efba Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 27 Feb 2024 16:07:50 +0100
Subject: [PATCH 42/61] use startswith instead of re.search

---
 src/scripts/evaluate_predictions.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py
index 8588f7e..eaf8e72 100644
--- a/src/scripts/evaluate_predictions.py
+++ b/src/scripts/evaluate_predictions.py
@@ -11,15 +11,16 @@ def get_gold_data(doc, base_url):

     in_file = open(GOLD_DATA_FILE, "r")
     for line in in_file:
-        if re.search(f"^-DOCSTART- \({doc} ", line):
+        if line.startswith(f"-DOCSTART- ({doc} "):
             break
     for line in in_file:
-        if re.search(f"^-DOCSTART- ", line):
+        if line.startswith(f"-DOCSTART- "):
             break
         fields = line.strip().split("\t")
         if len(fields) > 3:
             if fields[1] == "B":
                 entities.append([fields[2], fields[3]])
+    in_file.close()
     return entities
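startswith does a literal prefix comparison, so the document name no longer needs regex escaping; the old pattern had to escape "(" by hand and would misbehave on ids containing regex metacharacters. A short demonstration (the document id is hypothetical):

import re

doc = "1163testb SOCCER"  # made-up AIDA-style document id
line = f"-DOCSTART- ({doc} "

# Literal prefix test: no escaping pitfalls, no regex compilation cost.
assert line.startswith(f"-DOCSTART- ({doc} ")

# The regex equivalent only stays safe if the interpolated id is escaped too.
assert re.search(rf"^-DOCSTART- \({re.escape(doc)} ", line)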
From 8a051976024243d7de2572c3e9a4fba73fe3b177 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 27 Feb 2024 18:31:41 +0100
Subject: [PATCH 43/61] simplified computations

---
 src/scripts/evaluate_predictions.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py
index eaf8e72..de1e287 100644
--- a/src/scripts/evaluate_predictions.py
+++ b/src/scripts/evaluate_predictions.py
@@ -65,6 +65,11 @@ def compare_entities(gold_entities, predicted_entities):


 def count_entities(gold_entities, predicted_entities, gold_links, predicted_links):
+    """ returns: - correct: number of entities correctly identified and correctly linked
+                 - wrong_md: number of entities identified but wrong
+                 - wrong_el: number of entities correctly identified but incorrectly linked
+                 - missed: number of gold standard entities not found and not linked
+    """
     correct = 0
     wrong_md = 0
     wrong_el = 0
@@ -73,6 +78,7 @@
         if predicted_links[predicted_i] == UNUSED:
             wrong_md += 1
         elif predicted_entities[predicted_i][1] == gold_entities[predicted_links[predicted_i]][1]:
+            # assumption: predicted_entities[predicted_i][0] == gold_entities[predicted_links[predicted_i]][0]
             correct += 1
         else:
             wrong_el += 1
@@ -88,9 +94,10 @@ def compare_and_count_entities(gold_entities, predicted_entities):


 def compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all):
-    if correct_all + wrong_el_all > 0:
-        precision_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + wrong_md_all)
-        recall_md = 100*(correct_all + wrong_el_all) / (correct_all + wrong_el_all + missed_all)
+    correct_md_all = correct_all + wrong_el_all
+    if correct_md_all > 0:
+        precision_md = 100 * correct_md_all / (correct_md_all + wrong_md_all)
+        recall_md = 100 * correct_md_all / (correct_md_all + missed_all)
         f1_md = 2 * precision_md * recall_md / ( precision_md + recall_md )
     else:
         precision_md = 0
@@ -100,6 +107,7 @@


 def compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all):
+    """ reported el_scores are combined md plus el scores """
     if correct_all > 0:
         precision_el = 100 * correct_all / (correct_all + wrong_md_all + wrong_el_all)
         recall_el = 100 * correct_all / (correct_all + wrong_el_all + missed_all)
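In these scripts a mention-detection (MD) prediction counts as correct whenever the span matches, regardless of the link (correct + wrong_el), while entity-linking (EL) credit requires span and link to both be right. A worked example with made-up counts, written as plain fractions (this patch still scales by 100; the next one removes that factor):

# Made-up tallies: 8 correct links, 1 wrong link on a correct span,
# 2 spurious spans, 3 gold mentions never found.
correct, wrong_el, wrong_md, missed = 8, 1, 2, 3

correct_md = correct + wrong_el                            # span found: 9
precision_md = correct_md / (correct_md + wrong_md)        # 9 / 11 ~ 0.818
recall_md = correct_md / (correct_md + missed)             # 9 / 12 = 0.750
f1_md = 2 * precision_md * recall_md / (precision_md + recall_md)  # ~ 0.783

precision_el = correct / (correct + wrong_md + wrong_el)   # 8 / 11 ~ 0.727
recall_el = correct / (correct + wrong_el + missed)        # 8 / 12 ~ 0.667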
From 565a96a3ff1f54a2b029cbc002c02ea30c26972d Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 5 Mar 2024 14:27:13 +0100
Subject: [PATCH 44/61] print with % instead of f

---
 src/scripts/evaluate_predictions.py | 12 ++++++------
 tests/test_evaluate_predictions.py  |  9 ++++++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py
index de1e287..beba193 100644
--- a/src/scripts/evaluate_predictions.py
+++ b/src/scripts/evaluate_predictions.py
@@ -96,8 +96,8 @@ def compare_and_count_entities(gold_entities, predicted_entities):
 def compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all):
     correct_md_all = correct_all + wrong_el_all
     if correct_md_all > 0:
-        precision_md = 100 * correct_md_all / (correct_md_all + wrong_md_all)
-        recall_md = 100 * correct_md_all / (correct_md_all + missed_all)
+        precision_md = correct_md_all / (correct_md_all + wrong_md_all)
+        recall_md = correct_md_all / (correct_md_all + missed_all)
         f1_md = 2 * precision_md * recall_md / ( precision_md + recall_md )
     else:
         precision_md = 0
@@ -109,8 +109,8 @@
 def compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all):
     """ reported el_scores are combined md plus el scores """
     if correct_all > 0:
-        precision_el = 100 * correct_all / (correct_all + wrong_md_all + wrong_el_all)
-        recall_el = 100 * correct_all / (correct_all + wrong_el_all + missed_all)
+        precision_el = correct_all / (correct_all + wrong_md_all + wrong_el_all)
+        recall_el = correct_all / (correct_all + wrong_el_all + missed_all)
         f1_el = 2 * precision_el * recall_el / ( precision_el + recall_el )
     else:
         precision_el = 0.0
@@ -123,8 +123,8 @@ def print_scores(correct_all, wrong_md_all, wrong_el_all, missed_all):
     precision_md, recall_md, f1_md = compute_md_scores(correct_all, wrong_md_all, wrong_el_all, missed_all)
     precision_el, recall_el, f1_el = compute_el_scores(correct_all, wrong_md_all, wrong_el_all, missed_all)
     print("Results: PMD RMD FMD PEL REL FEL: ", end="")
-    print(f"{precision_md:0.1f}% {recall_md:0.1f}% {f1_md:0.1f}% | ",end="")
-    print(f"{precision_el:0.1f}% {recall_el:0.1f}% {f1_el:0.1f}%")
+    print(f"{precision_md:0.1%} {recall_md:0.1%} {f1_md:0.1%} | ",end="")
+    print(f"{precision_el:0.1%} {recall_el:0.1%} {f1_el:0.1%}")
     return precision_md, recall_md, f1_md, precision_el, recall_el, f1_el
diff --git a/tests/test_evaluate_predictions.py b/tests/test_evaluate_predictions.py
index 0000624..3e8475f 100644
--- a/tests/test_evaluate_predictions.py
+++ b/tests/test_evaluate_predictions.py
@@ -1,4 +1,5 @@
 from scripts.evaluate_predictions import compare_and_count_entities, print_scores
+import pytest


 def test_perfect():
@@ -6,7 +7,7 @@
     predicted_entities = [ [ "1", "1" ] ]
     counts = compare_and_count_entities(gold_entities, predicted_entities)
     scores = print_scores(*counts)
-    assert list(scores) == [100, 100, 100, 100, 100, 100], "should be perfect MD and perfect EL"
+    assert list(scores) == [1, 1, 1, 1, 1, 1], "should be perfect MD and perfect EL"


 def test_el_wrong():
@@ -14,7 +15,7 @@
     predicted_entities = [ [ "1", "0" ] ]
     counts = compare_and_count_entities(gold_entities, predicted_entities)
     scores = print_scores(*counts)
-    assert list(scores) == [100, 100, 100, 0, 0, 0], "should be perfect MD and failed EL"
+    assert list(scores) == [1, 1, 1, 0, 0, 0], "should be perfect MD and failed EL"


 def test_md_wrong():
@@ -30,5 +31,7 @@ def test_combined():
     predicted_entities = [ [ "0", "0" ], [ "0", "1" ], [ "1", "0" ], [ "1", "1" ] ]
     counts = compare_and_count_entities(gold_entities, predicted_entities)
     scores = print_scores(*counts)
-    assert list(scores) == [100/2, 100*2/3, 100*4/7, 100/4, 100/3, 100*2/7], "should be various scores"
+    target_scores = [1/2, 2/3, 4/7, 1/4, 1/3, 2/7]
+    for index in range(0, len(scores)):
+        assert pytest.approx(scores[index], 0.0001) == target_scores[index], "should be various scores"
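The "%" format spec multiplies by 100 and appends the percent sign itself, which is why the 100 * factors could be dropped and the score functions can return plain fractions. A two-line demonstration:

recall = 2 / 3
print(f"{recall:0.1%}")         # 66.7%  - scaling and the % sign come for free
print(f"{100 * recall:0.1f}%")  # 66.7%  - the old approach, scaled by hand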
From 9d713cc1ed8d68477a03c6c394216d94ded091e3 Mon Sep 17 00:00:00 2001
From: Erik Tjong Kim Sang
Date: Tue, 5 Mar 2024 15:02:40 +0100
Subject: [PATCH 45/61] removed is_flair function argument

---
 src/REL/mention_detection.py | 14 ++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py
index b6e42ee..58833c1 100644
--- a/src/REL/mention_detection.py
+++ b/src/REL/mention_detection.py
@@ -28,7 +28,7 @@ def format_spans(self, dataset, tagger_ner_name, process_sentences, split_docs_v
         :return: Dictionary with mentions per document.
         """

-        dataset, _, _ = self.split_text(dataset, process_sentences, split_docs_value, tagger_ner, re.search("flair", tagger_ner_name))
+        dataset, _, _ = self.split_text(dataset, process_sentences, split_docs_value, tagger_ner)
         results = {}
         total_ment = 0
@@ -76,7 +76,7 @@ def split_single(self, text):

         return sentences

-    def split_text(self, dataset, process_sentences, split_docs_value=0, tagger_ner=None, is_flair=False):
+    def split_text(self, dataset, process_sentences, split_docs_value=0, tagger_ner=None):
         """
         Splits text into sentences with optional spans (format is a requirement for GERBIL usage).
         This behavior is required for the default NER-tagger, which during experiments was experienced
@@ -88,6 +88,7 @@ def split_text(self, dataset, process_sentences, split_docs_value=0, tagger_ner=
         res = {}
         splits = [0]
         processed_sentences = []
+        is_flair = isinstance(tagger_ner, SequenceTagger)
         for doc in dataset:
             text, spans = dataset[doc]
             if process_sentences:
@@ -95,11 +96,11 @@
             if split_docs_value > 0:
                 sentences_split = []
                 for sentence in sentences:
-                    split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger_ner, is_flair)
+                    split_sentences = self.split_text_in_parts(sentence, split_docs_value, tagger_ner)
                     sentences_split.extend(split_sentences)
                 sentences = sentences_split
             elif split_docs_value > 0:
-                sentences = self.split_text_in_parts(text, split_docs_value, tagger_ner, is_flair)
+                sentences = self.split_text_in_parts(text, split_docs_value, tagger_ner)
             else:
                 sentences = [ text ]
             res[doc] = {}
@@ -160,7 +161,7 @@ def split_sentence_in_bert_tokens(self, sentence, tagger_ner):

         return tokens, token_spans

-    def split_text_in_parts(self, text, split_docs_value, tagger_ner, is_flair):
+    def split_text_in_parts(self, text, split_docs_value, tagger_ner):
         """ Splits text in parts of at most split_docs_value tokens. Texts are split at sentence
         boundaries. If a sentence is longer than the limit it will be split in parts of
@@ -169,6 +170,7 @@ def split_text_in_parts(self, text, split_docs_value, tagger_ner, is_flair):
         sentences = self.split_single(text)
         token_lists = []
         texts = []
+        is_flair = isinstance(tagger_ner, SequenceTagger)
         for sentence in sentences:
             if is_flair:
                 raise Exception("Splitting documents does not work in combination with Flair")
@@ -220,7 +222,7 @@ def find_mentions(self, dataset, tagger_ner_name, process_sentences, split_docs_
         # Verify if Flair, else ngram or custom.
         is_flair = isinstance(tagger_ner, SequenceTagger)
         dataset_sentences_raw, processed_sentences, splits = self.split_text(
-            dataset, process_sentences, split_docs_value, tagger_ner, is_flair
+            dataset, process_sentences, split_docs_value, tagger_ner
         )
         results = {}
         total_ment = 0
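Patch 45 stops threading an is_flair boolean through split_text and its callees: the tagger object's own type now answers the question wherever it is needed. A loose sketch of the dispatch, mirroring the branch visible in find_mentions (the helper names are stand-ins, not REL's API):

from flair.models import SequenceTagger  # the same import mention_detection.py relies on

def ner_entities(snt, sentence, tagger_ner, combine_entities):
    # One isinstance check at the point of use replaces a flag that every
    # intermediate function previously had to accept and pass along.
    if isinstance(tagger_ner, SequenceTagger):
        return snt.get_spans("ner")                       # Flair sentence, already tagged
    return combine_entities(tagger_ner(snt), sentence)    # Hugging Face pipeline output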
) def test_md(): - tagger_ner_name = "bert_base_cased" tagger_ner = load_bert_ner("dslim/bert-base-NER") process_sentences = False split_docs_value = 0 @@ -28,7 +27,7 @@ def test_md(): # first test case: repeating sentences sample1 = {"test_doc": ["Fox, Fox. Fox.", []]} resulting_spans1 = {(0, 3), (5, 3), (10, 3)} - predictions = md.find_mentions(sample1, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) + predictions = md.find_mentions(sample1, process_sentences, split_docs_value, tagger_ner) predicted_spans = [] for i in range(0, 1): p = { @@ -41,7 +40,7 @@ def test_md(): # second test case: excessive whitespace sample2 = {"test_doc": ["Fox,                Fox,                   Fox.", []]} resulting_spans2 = {(0, 3), (20, 3), (43, 3)} - predictions = md.find_mentions(sample2, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) + predictions = md.find_mentions(sample2, process_sentences, split_docs_value, tagger_ner) predicted_spans = { (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] } diff --git a/tests/test_ed_pipeline.py b/tests/test_ed_pipeline.py index baa500f..10fb7b7 100644 --- a/tests/test_ed_pipeline.py +++ b/tests/test_ed_pipeline.py @@ -28,7 +28,7 @@ def test_pipeline(): tagger = Cmns(base_url, wiki_subfolder, n=5) model = EntityDisambiguation(base_url, wiki_subfolder, config) - mentions_dataset, total_mentions = md.format_spans(sample, tagger_ner_name="flair", process_sentences=True) + mentions_dataset, total_mentions = md.format_spans(sample, process_sentences=True) predictions, _ = model.predict(mentions_dataset) results = process_results( diff --git a/tests/test_flair_md.py b/tests/test_flair_md.py index 5c0d445..4dcbdc2 100644 --- a/tests/test_flair_md.py +++ b/tests/test_flair_md.py @@ -16,7 +16,6 @@ ) def test_md(): # return standard Flair tagger + mention detection object - tagger_ner_name = "flair" tagger_ner = SequenceTagger.load("ner-fast") process_sentences = True split_docs_value = 0 @@ -27,7 +26,7 @@ def test_md(): # first test case: repeating sentences sample1 = {"test_doc": [ "Fox. Fox. Fox." , []] } resulting_spans1 = {(0, 3), (5, 3), (10, 3)} - predictions = md.find_mentions(sample1, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) + predictions = md.find_mentions(sample1, process_sentences, split_docs_value, tagger_ner) predicted_spans = { (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] @@ -37,7 +36,7 @@ def test_md(): # second test case: excessive whitespace sample2 = {"test_doc": ["Fox. Fox.
Fox.", []]} resulting_spans2 = {(0, 3), (20, 3), (43, 3)} - predictions = md.find_mentions(sample2, tagger_ner_name, process_sentences, split_docs_value, tagger_ner) + predictions = md.find_mentions(sample2, process_sentences, split_docs_value, tagger_ner) predicted_spans = { (m["pos"], m["end_pos"] - m["pos"]) for m in predictions[0]["test_doc"] } From 6b5b10da8973075159920f2402c0cacb73e8cf17 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 5 Mar 2024 16:12:44 +0100 Subject: [PATCH 47/61] updated combine_entities output format --- src/REL/mention_detection.py | 42 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index d4eb67a..eebd6f1 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -8,6 +8,14 @@ from REL.mention_detection_base import MentionDetectionBase +class Entity: + def __init__(self, text, start_position, end_position, score, tag): + self.text = text + self.start_position = start_position + self.end_position = end_position + self.score = score + self.tag = tag + class MentionDetection(MentionDetectionBase): """ @@ -131,7 +139,7 @@ def split_text(self, dataset, process_sentences, split_docs_value=0, tagger_ner= return res, processed_sentences, splits - def combine_entities(self, ner_results): + def combine_entities(self, ner_results, sentence): ner_results_out = [] i = 0 while i < len(ner_results): @@ -150,8 +158,9 @@ def combine_entities(self, ner_results): last_end = ner_results[i+j]["end"] j += 1 i += j - return ner_results_out - + return [ Entity(sentence[entity["start"]: entity["end"]], entity["start"], entity["end"], entity["score"], entity["entity"]) + for entity in ner_results_out ] + def split_sentence_in_bert_tokens(self, sentence, tagger_ner): tokenizer_results = tagger_ner.tokenizer([sentence], return_offsets_mapping=True) # warns if sentence is too long (>512) @@ -249,26 +258,17 @@ def find_mentions(self, dataset, process_sentences, split_docs_value=0, tagger_n offset = 0 entity_counter = 0 for entity in ( - snt.get_spans("ner") + snt.get_spans("ner") # predict with Flair if is_flair - else self.combine_entities(tagger_ner(snt)) # predict with BERT + else self.combine_entities(tagger_ner(snt), sentence) # predict with BERT ): - if not is_flair: - text, start_pos, end_pos, conf, tag = ( - sentence[entity["start"]:entity["end"]], # for BERT - entity["start"], - entity["end"], - entity["score"], - entity["entity"], - ) - else: - text, start_pos, end_pos, conf, tag = ( - entity.text, # for Flair - entity.start_position, - entity.end_position, - entity.score, - entity.tag, - ) + text, start_pos, end_pos, conf, tag = ( + entity.text, + entity.start_position, + entity.end_position, + entity.score, + entity.tag, + ) total_ment += 1 m = self.preprocess_mention(text) # m may be different from text cands = self.get_candidates(m) From c21e85fa093a587c067957d07867a99cbd0d8f97 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 12:39:23 +0100 Subject: [PATCH 48/61] replaced re.sub by str.removeprefix --- src/REL/mention_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index eebd6f1..8ff43a1 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -151,7 +151,7 @@ def combine_entities(self, ner_results, sentence): re.search("^I", ner_results[i+j]["entity"]) and re.sub("^..", "", 
ner_results[i+j]["entity"]) == re.sub("^..", "", ner_results[i]["entity"]))): if ner_results[i+j]["start"] == last_end: - ner_results_out[-1]["word"] += re.sub("^##", "", ner_results[i+j]["word"]) + ner_results_out[-1]["word"] += ner_results[i+j]["word"].removeprefix("##") else: ner_results_out[-1]["word"] += " " + ner_results[i+j]["word"] ner_results_out[-1]["end"] = ner_results[i+j]["end"] From bdc149cd771537dd666308565a1f9b7e1e97fa84 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 13:58:37 +0100 Subject: [PATCH 49/61] removed redundant re calls --- src/REL/mention_detection.py | 10 +++++----- src/REL/server.py | 1 - src/scripts/efficiency_test.py | 1 - src/scripts/evaluate_predictions.py | 1 - 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index 8ff43a1..d195b4f 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -1,4 +1,3 @@ -import re import sys from termcolor import colored from flair.data import Sentence @@ -148,8 +147,8 @@ def combine_entities(self, ner_results, sentence): j = 1 while i + j < len(ner_results) and (ner_results[i+j]["start"] == last_end or (ner_results[i+j]["start"] == last_end + 1 and - re.search("^I", ner_results[i+j]["entity"]) and - re.sub("^..", "", ner_results[i+j]["entity"]) == re.sub("^..", "", ner_results[i]["entity"]))): + ner_results[i+j]["entity"].startswith("I") and + ner_results[i+j]["entity"][2:] == ner_results[i]["entity"][2:])): if ner_results[i+j]["start"] == last_end: ner_results_out[-1]["word"] += ner_results[i+j]["word"].removeprefix("##") else: @@ -205,12 +204,13 @@ def split_text_in_parts(self, text, split_docs_value, tagger_ner): def prune_word_internal_mentions(self, raw_text, result_doc, total_ment): + """ Remove entities which are part of a larger word. """ to_be_deleted = [] for i in range(0, len(result_doc)): start_pos = result_doc[i]["pos"] end_pos = result_doc[i]["end_pos"] - if ((i > 0 and re.search("[a-zA-Z]", raw_text[start_pos-1])) or - (end_pos < len(raw_text) and re.search("[a-zA-Z]", raw_text[end_pos]))): + if ((start_pos > 0 and raw_text[start_pos-1].isalpha()) or + (end_pos < len(raw_text) and raw_text[end_pos].isalpha())): to_be_deleted.append(i) total_ment -= len(to_be_deleted) while len(to_be_deleted) > 0: diff --git a/src/REL/server.py b/src/REL/server.py index ca17761..81d2529 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -1,7 +1,6 @@ import json import numpy import os -import re from http.server import BaseHTTPRequestHandler from flair.models import SequenceTagger diff --git a/src/scripts/efficiency_test.py b/src/scripts/efficiency_test.py index 7c4a334..a3ce630 100644 --- a/src/scripts/efficiency_test.py +++ b/src/scripts/efficiency_test.py @@ -3,7 +3,6 @@ import json import numpy as np import os -import re import requests from REL.ner.set_tagger_ner import set_tagger_ner diff --git a/src/scripts/evaluate_predictions.py b/src/scripts/evaluate_predictions.py index beba193..1276b69 100644 --- a/src/scripts/evaluate_predictions.py +++ b/src/scripts/evaluate_predictions.py @@ -1,5 +1,4 @@ import os -import re UNUSED = -1
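After the two cleanups above (PATCH 48 and PATCH 49), the merging logic of combine_entities is easiest to see on an example. A small illustration, not part of any patch: given token-level output of a Hugging Face "ner" pipeline, adjacent word pieces and same-type I- continuations are merged into one Entity. The input below is hypothetical but follows the pipeline's output format:

    # Token-level results for the sentence "Angela Merkel" (scores invented):
    ner_results = [
        {"word": "Angela", "entity": "B-PER", "start": 0, "end": 6, "score": 0.99},
        {"word": "Mer", "entity": "I-PER", "start": 7, "end": 10, "score": 0.98},
        {"word": "##kel", "entity": "I-PER", "start": 10, "end": 13, "score": 0.97},
    ]
    # "Mer" starts one position after "Angela" ends and carries a same-type I- tag,
    # so it extends the mention with a space; "##kel" starts exactly where "Mer"
    # ends, so it is glued on without a space and its "##" prefix is stripped.
    # combine_entities(ner_results, "Angela Merkel") therefore yields one Entity
    # with text "Angela Merkel", start_position 0, end_position 13, and the score
    # and tag of the first piece (0.99, "B-PER").

From 2e73d0cd257bd1691ef1334084ffca1aeff64395 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 14:15:59 +0100 Subject: [PATCH 50/61] crash without loaded model --- src/REL/ner/bert_wrapper.py | 2 +- src/REL/ner/flair_wrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/REL/ner/bert_wrapper.py b/src/REL/ner/bert_wrapper.py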
index ea97844..ea67823 100644 --- a/src/REL/ner/bert_wrapper.py +++ b/src/REL/ner/bert_wrapper.py @@ -7,5 +7,5 @@ def load_bert_ner(path_or_url): model = AutoModelForTokenClassification.from_pretrained(path_or_url) return pipeline("ner", model=model, tokenizer=tokenizer) except Exception: - pass + raise SystemExit(f"cannot load Bert named entity recognition module from {path_or_url}") return diff --git a/src/REL/ner/flair_wrapper.py b/src/REL/ner/flair_wrapper.py index 7a3699d..377b6a2 100644 --- a/src/REL/ner/flair_wrapper.py +++ b/src/REL/ner/flair_wrapper.py @@ -8,5 +8,5 @@ def load_flair_ner(path_or_url): try: return SequenceTagger.load(path_or_url) except Exception: - pass + raise SystemExit(f"cannot load Flair named entity recognition module from {path_or_url}") return SequenceTagger.load(fetch_model(path_or_url, cache_root)) From 3bf20d24c8813c5f636e4b4539f22e926a52630b Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 14:39:57 +0100 Subject: [PATCH 51/61] simplified split_docs_value variable --- src/REL/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/REL/server.py b/src/REL/server.py index 81d2529..eaa3dda 100644 --- a/src/REL/server.py +++ b/src/REL/server.py @@ -186,13 +186,13 @@ def generate_response(self, text, spans): p.add_argument("--port", "-p", default=5555, type=int) p.add_argument("--tagger_ner_name", default="flair", help = "mention detection tagger") p.add_argument("--process_sentences", help = "process sentences rather than documents", action="store_true") - p.add_argument("--split_docs_value", default="0", help = "threshold number of tokens to split document") + p.add_argument("--split_docs_value", action="store", type=int, default=0, help = "threshold number of tokens to split document") args = p.parse_args() tagger_ner_name = args.tagger_ner_name tagger_ner = set_tagger_ner(tagger_ner_name) - split_docs_value = int(args.split_docs_value) + split_docs_value = args.split_docs_value process_sentences = args.process_sentences From ef5c5a9f823aa2e0e4cad33a10e588302bf56c77 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 16:05:37 +0100 Subject: [PATCH 52/61] removed hard-coded paths --- conftest.py | 11 +++++++++++ tests/test_bert_md.py | 3 +-- tests/test_flair_md.py | 3 +-- 3 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 conftest.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..9653aac --- /dev/null +++ b/conftest.py @@ -0,0 +1,11 @@ +import os +import pytest + + +def pytest_addoption(parser): + parser.addoption("--base_url", action="store", default=os.path.dirname(__file__) + "/src/data/") + + +@pytest.fixture +def base_url(request): + return request.config.getoption("--base_url") diff --git a/tests/test_bert_md.py b/tests/test_bert_md.py index 2ea6d08..0565547 100644 --- a/tests/test_bert_md.py +++ b/tests/test_bert_md.py @@ -16,11 +16,10 @@ @pytest.mark.skipif( os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." 
) -def test_md(): +def test_md(base_url): tagger_ner = load_bert_ner("dslim/bert-base-NER") process_sentences = False split_docs_value = 0 - base_url = os.path.dirname(__file__) + "/../src/data/" wiki_version = "wiki_2019" md = MentionDetection(base_url, wiki_version) diff --git a/tests/test_flair_md.py b/tests/test_flair_md.py index 4dcbdc2..1311830 100644 --- a/tests/test_flair_md.py +++ b/tests/test_flair_md.py @@ -14,12 +14,11 @@ @pytest.mark.skipif( os.getenv("GITHUB_ACTIONS")=='true', reason="No way of testing this on Github actions." ) -def test_md(): +def test_md(base_url): # return standard Flair tagger + mention detection object tagger_ner = SequenceTagger.load("ner-fast") process_sentences = True split_docs_value = 0 - base_url = os.path.dirname(__file__) + "/../src/data/" wiki_version = "wiki_2019" md = MentionDetection(base_url, wiki_version) From e7304fbfa609170dc0a6b89e73ba7aff434fe3b6 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 17:13:34 +0100 Subject: [PATCH 53/61] enabling manual action run --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 23371b4..433dfc8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,6 +7,8 @@ on: - main pull_request: branches: [ main ] + workflow_dispatch: + jobs: build: From 7cd274f4a7a58db2a80513fa2a05edfb445dc8b1 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 18:43:02 +0100 Subject: [PATCH 54/61] changing python version --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 433dfc8..e02271c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.9] steps: - uses: actions/checkout@v3 From e5009909d36696c5ff46038317f6f7b3c352cb59 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 19 Mar 2024 18:50:13 +0100 Subject: [PATCH 55/61] changed pytest arguments --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e02271c..622eefd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -42,4 +42,4 @@ jobs: - name: Test with pytest run: | - pytest -W ignore + pytest tests
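Together with the conftest.py fixture from PATCH 52, the build now runs plain pytest over the tests directory, and the test-data location can be overridden on the command line instead of being hard-coded; for example (the path is illustrative):

    pytest tests --base_url /path/to/REL/src/data/

From 4858dce5eb6e701fe7ee5de72d9116fee6d574c1 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Thu, 28 Mar 2024 18:30:30 +0100 Subject: [PATCH 56/61] fixing merge conflicts --- src/REL/mention_detection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/REL/mention_detection.py b/src/REL/mention_detection.py index d195b4f..fe0e674 100644 --- a/src/REL/mention_detection.py +++ b/src/REL/mention_detection.py @@ -248,10 +248,6 @@ def find_mentions(self, dataset, process_sentences, split_docs_value=0, tagger_n for (idx_sent, (sentence, ground_truth_sentence)), snt in zip( contents.items(), sentences ): - - # Only include offset if using Flair.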
- # if is_flair: - # 20220607: no always include offset = raw_text.find(sentence, cum_sent_length) if offset < 0: print(colored(f"sentence not found in text: cannot happen: {sentence}", "red"), file=sys.stderr) From 191b08ac2c1f2de21e72c017032495b88a54ca5f Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 8 Apr 2024 14:17:37 +0200 Subject: [PATCH 57/61] corrected incomplete path --- docs/tutorials/custom_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/custom_models.md b/docs/tutorials/custom_models.md index 5dca8c2..373fb95 100644 --- a/docs/tutorials/custom_models.md +++ b/docs/tutorials/custom_models.md @@ -20,7 +20,7 @@ model, you can only use a local filepath. NER and ED models that we provide as part of REL can be loaded easily using aliases. Available models are listed -[on the REL repository](https://github.com/informagi/REL/tree/master/REL/models/models.json). +[on the REL repository](https://github.com/informagi/REL/tree/master/src/REL/models/models.json). All models that need to be downloaded from the web are cached for subsequent use. From 59fb3ae74a66d368f506ac6f5a4cafc37a517c15 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Mon, 8 Apr 2024 15:49:58 +0200 Subject: [PATCH 58/61] added documentation for ner --- docs/tutorials/index.md | 13 +++++++------ docs/tutorials/ner.md | 24 ++++++++++++++++++++++++ src/REL/ner/set_tagger_ner.py | 3 +-- 3 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 docs/tutorials/ner.md diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 3aba1f0..0317754 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -9,9 +9,10 @@ The remainder of the tutorials are optional and for users who wish to e.g. train 1. [How to get started (project folder and structure).](how_to_get_started/) 2. [End-to-End Entity Linking.](e2e_entity_linking/) -3. [Evaluate on GERBIL.](evaluate_gerbil/) -4. [Deploy REL for a new Wikipedia corpus](deploy_REL_new_wiki/): -5. [Reproducing our results](reproducing_our_results/) -6. [REL server](server/) -7. [Notes on using custom models](custom_models/) -7. [Conversational entity linking](conversations/) +3. [Mention Detection models.](ner/) +4. [Evaluate on GERBIL.](evaluate_gerbil/) +5. [Deploy REL for a new Wikipedia corpus](deploy_REL_new_wiki/): +6. [Reproducing our results](reproducing_our_results/) +7. [REL server](server/) +8. [Notes on using custom models](custom_models/) +9. 
[Conversational entity linking](conversations/) diff --git a/docs/tutorials/ner.md b/docs/tutorials/ner.md new file mode 100644 index 0000000..03b642b --- /dev/null +++ b/docs/tutorials/ner.md @@ -0,0 +1,24 @@ +# Mention Detection models + +REL offers different named entity models for mention detection: + +- `flair`: named-entity model for English, expects upper and lower case text (default) +- `bert_base_cased`: BERT-base named-entity model for English, expects upper and lower case text +- `bert_base_uncased`: BERT-base named-entity model for English, expects lower case text +- `bert_large_cased`: BERT-large named-entity model for English, expects upper and lower case text +- `bert_large_uncased`: BERT-large named-entity model for English, expects lower case text +- `bert_multilingual`: multilingual BERT named-entity model, expects upper and lower case text + +To change the default Flair model, specify the required model with the `--tagger_ner_name` option, for example when calling the server: + +```bash +python src/REL/server.py --tagger_ner_name bert_base_cased +``` + +or pass a tagger object, obtained with `set_tagger_ner` from `REL.ner.set_tagger_ner`, as the `tagger_ner` parameter of a mention detection call: + +```python +mentions_dataset, n_mentions = mention_detection.find_mentions(docs, process_sentences=False, tagger_ner=set_tagger_ner("bert_base_cased")) +``` + +The available named entity models are specified in the file `src/REL/ner/set_tagger_ner.py`. The model names refer to models hosted on huggingface.co, for example https://huggingface.co/flair/ner-english-fast . The file can be extended with new models, for example for other languages. diff --git a/src/REL/ner/set_tagger_ner.py b/src/REL/ner/set_tagger_ner.py index 363ac7c..9b72f12 100644 --- a/src/REL/ner/set_tagger_ner.py +++ b/src/REL/ner/set_tagger_ner.py @@ -3,9 +3,8 @@ from flair.models import SequenceTagger from REL.ner.bert_wrapper import load_bert_ner - taggers_ner = { - "flair": "ner-fast", + "flair": "flair/ner-english-fast", "bert_base_cased": "dslim/bert-base-NER", "bert_base_uncased": "dslim/bert-base-NER-uncased", "bert_large_cased": "dslim/bert-large-NER", From 19ca5a4161115559648051daf98a60701aae016e Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 9 Apr 2024 14:33:35 +0200 Subject: [PATCH 59/61] restricted scipy version --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c84bf33..cefc92e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ pydantic segtok torch uvicorn +scipy<=1.12.0 From abf41523640b802abd067b6de3c3c10092ea3ad3 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 9 Apr 2024 14:42:33 +0200 Subject: [PATCH 60/61] restricted scipy version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cefc92e..a9025ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ pydantic segtok torch uvicorn -scipy<=1.12.0 +scipy<=1.10 From becdac1a919d09a94b73f35feb939c6e19b7f282 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 9 Apr 2024 15:02:17 +0200 Subject: [PATCH 61/61] restricted scipy version --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 3fb2f46..28a050a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,6 +53,7 @@ install_requires = termcolor syntok spacy + scipy<=1.12.0 [options.extras_require] develop =
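The "Mention Detection models" tutorial added in PATCH 58 notes that src/REL/ner/set_tagger_ner.py can be extended with further models. A minimal sketch of such an extension; the last alias and model id are placeholders rather than tested recommendations, but any Hugging Face token-classification checkpoint loadable by load_bert_ner should fit:

    taggers_ner = {
        "flair": "flair/ner-english-fast",
        "bert_base_cased": "dslim/bert-base-NER",
        # ... remaining entries as in the file ...
        "bert_ner_german": "example-org/bert-german-ner",  # placeholder model id
    }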