diff --git a/pipelines/FAQ.md b/pipelines/FAQ.md index 0c199d611fbe..d4ff5c147535 100644 --- a/pipelines/FAQ.md +++ b/pipelines/FAQ.md @@ -221,3 +221,48 @@ click版本过高导致: ``` pip install click==8.0 ``` + +#### 如何新增最新的 PyTorch 检索模型 + +PaddleNLP-Pipelines 提供了可自动将 PyTorch 相关的权重转化为 Paddle 权重的接口,以BAAI/bge-large-zh-v1.5为例,代码如下: + +```python +from paddlenlp.transformers import AutoModel, AutoTokenizer +model = AutoModel.from_pretrained("BAAI/bge-large-zh-v1.5", from_hf_hub=True, convert_from_torch=True) +tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5', from_hf_hub=True) + +model.save_pretrained("BAAI/bge-large-zh-v1.5") +tokenizer.save_pretrained("BAAI/bge-large-zh-v1.5") +``` + +然后在 paddlenlp/taskflow/taskflow.py 的任务注册表中(具体位置见下方的 taskflow 注册地址链接)像这样注册一下即可使用: + +``` +"BAAI/bge-large-zh-v1.5": { + "task_class": SentenceFeatureExtractionTask, + "task_flag": "feature_extraction-BAAI/bge-large-zh-v1.5", + "task_priority_path": "BAAI/bge-large-zh-v1.5", + }, +``` + +[taskflow 注册地址](https://github.com/PaddlePaddle/PaddleNLP/blob/b6dcb4e19efd85911b13a0fc587fef33578cfebf/paddlenlp/taskflow/taskflow.py#L680) + +使用方式示例如下: + +``` +document_store = FAISSDocumentStore.load(your_index_name) +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="BAAI/bge-large-zh-v1.5", + passage_embedding_model="BAAI/bge-large-zh-v1.5", + output_emb_size=None, + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=16, + use_gpu=True, + embed_title=False, + pooling_mode="mean_tokens", +) +``` + +**注意** bge-m3的底座模型是XLMRobertaModel,paddlenlp没有实现,不推荐使用。 diff --git a/pipelines/pipelines/document_stores/faiss.py b/pipelines/pipelines/document_stores/faiss.py index 4a55160a5982..8aaf57afc377 100644 --- a/pipelines/pipelines/document_stores/faiss.py +++ b/pipelines/pipelines/document_stores/faiss.py @@ -684,7 +684,7 @@ def _load_init_params_from_config( init_params["index_name"] = [] for index in index_path: faiss_index = faiss.read_index(str(index)) - index_name = str(index).split("/")[-1] + index_name = 
os.path.basename(str(index)) init_params["index_name"].append(index_name) init_params["faiss_index"][index_name] = faiss_index # Add other init params to override the ones defined in the init params file diff --git a/pipelines/tests/nodes/file_converter/test_pdf.py b/pipelines/tests/nodes/file_converter/test_pdf.py index 0324118fcac4..ab19c93fa306 100644 --- a/pipelines/tests/nodes/file_converter/test_pdf.py +++ b/pipelines/tests/nodes/file_converter/test_pdf.py @@ -25,10 +25,16 @@ def test_conversion(self): expected_result = [ { - "content": " A Simple PDF File \n This is a small demonstration .pdf file - \n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\x0c Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring. More, a little more text. The end, and just as well. ", + "content": " A Simple PDF File \n This is a small demonstration .pdf file - \n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n And more text. 
And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...", "content_type": "text", "meta": None, - } + }, + { + "content": " Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring. More, a little more text. The end, and just as well. ", + "content_type": "text", + "meta": None, + }, ] result = converter.convert(file_path, process_num=1) + self.assertEqual(len(result), 2) self.assertEqual(expected_result, result)