diff --git a/kag/examples/README.md b/kag/examples/README.md index 3e629a64..d5582999 100644 --- a/kag/examples/README.md +++ b/kag/examples/README.md @@ -1,172 +1,518 @@ -# Example -## Create project -Create your new knext project from knext cli tool. -* Edit your config file like example.cfg - ```python - [project] - project_name = KagDemo - description = A knext demo project showcasing KAG features. - namespace = KagDemo - project_dir = kagdemo - host_addr = http://localhost:8887 +# KAG Examples - [vectorizer] - type = openai - model = bge-m3 - api_key = EMPTY - base_url = http://127.0.0.1:11434/v1 - vector_dimensions = 1024 +[English](./README.md) | +[简体中文](./README_cn.md) - [llm] - type = ollama - base_url = http://localhost:11434/api/generate - model = llama3.1 +## 1. Precondition - [prompt] - biz_scene = default - language = zh +Please refer to [Quick Start](https://openspg.yuque.com/ndx6g9/cwh47i/rs7gr8g4s538b1n7) to install KAG and its dependency OpenSPG server, and learn about using KAG in developer mode. - [indexer] - with_semantic = False - similarity_threshold = 0.8 +## 2. Create a knowledge base - [retriever] - with_semantic = False - pagerank_threshold = 0.9 - match_threshold = 0.8 - top_k = 10 +### 2.1 Create the project - [log] - level = INFO +#### Step 1: Enter the examples directory - ``` -* Run command: - ```sh - kag project create --config_path example.cfg - ``` -* Put your source data in {project_dir}/builder/data (in the example kagdemo, we put a pdf file in the data directory.) -* Edit and your schema file.(**Optional**) - * Edit the schema in {project_dir}/schema/{project_name}.schema - * Run command in work directory {project_dir}: - ```sh - knext schema commit - ``` -* **Define your BuilderChain**: - * In the {project_dir}/builder/indexer.py, define your BuilderChain by extend the BuilderChainABC class and override the build method. 
For example: - ```python - import os +```bash +cd kag/examples +``` - from kag.builder.component.reader import DocxReader, PDFReader - from kag.builder.component.splitter import LengthSplitter, OutlineSplitter - from knext.builder.builder_chain_abc import BuilderChainABC - from kag.builder.component.extractor import KAGExtractor - from kag.builder.component.writer import KGWriter - from kag.solver.logic.solver_pipeline import SolverPipeline - import logging - from kag.common.env import init_kag_config +#### Step 2: Edit project configuration + +```bash +vim ./example_config.yaml +``` + +```yaml +#------------project configuration start----------------# +openie_llm: &openie_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +chat_llm: &chat_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +vectorize_model: &vectorize_model + api_key: key + base_url: https://api.siliconflow.cn/v1/ + model: BAAI/bge-m3 + type: openai + vector_dimensions: 1024 +vectorizer: *vectorize_model + +log: + level: INFO + +project: + biz_scene: default + host_addr: http://127.0.0.1:8887 + id: "1" + language: en + namespace: TwoWikiTest +#------------project configuration end----------------# + +#------------kag-builder configuration start----------------# +kag_builder_pipeline: + chain: + type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain + extractor: + type: schema_free_extractor # kag.builder.component.extractor.schema_free_extractor.SchemaFreeExtractor + llm: *openie_llm + ner_prompt: + type: default_ner # kag.builder.prompt.default.ner.OpenIENERPrompt + std_prompt: + type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt + triple_prompt: + type: default_triple # kag.builder.prompt.default.triple.OpenIETriplePrompt + reader: + type: dict_reader # kag.builder.component.reader.dict_reader.DictReader + post_processor: + type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor + similarity_threshold: 0.9 + splitter: + type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter + split_length: 100000 + window_length: 0 + vectorizer: + type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer + vectorize_model: *vectorize_model + writer: + type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter + num_threads_per_chain: 1 + num_chains: 16 + scanner: + type: 2wiki_dataset_scanner # kag.builder.component.scanner.dataset_scanner.MusiqueCorpusScanner +#------------kag-builder configuration end----------------# + +#------------kag-solver configuration start----------------# +search_api: &search_api + type: openspg_search_api #kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI + +graph_api: &graph_api + type: openspg_graph_api #kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi + +exact_kg_retriever: &exact_kg_retriever + type: default_exact_kg_retriever # kag.solver.retriever.impl.default_exact_kg_retriever.DefaultExactKgRetriever + el_num: 5 + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +fuzzy_kg_retriever: &fuzzy_kg_retriever + type: default_fuzzy_kg_retriever # kag.solver.retriever.impl.default_fuzzy_kg_retriever.DefaultFuzzyKgRetriever + el_num: 5 + vectorize_model: *vectorize_model + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +chunk_retriever: &chunk_retriever + 
type: default_chunk_retriever # kag.solver.retriever.impl.default_chunk_retriever.DefaultChunkRetriever
+  llm_client: *chat_llm
+  recall_num: 10
+  rerank_topk: 10
+
+kag_solver_pipeline:
+  memory:
+    type: default_memory # kag.solver.implementation.default_memory.DefaultMemory
+    llm_client: *chat_llm
+  max_iterations: 3
+  reasoner:
+    type: default_reasoner # kag.solver.implementation.default_reasoner.DefaultReasoner
+    llm_client: *chat_llm
+    lf_planner:
+      type: default_lf_planner # kag.solver.plan.default_lf_planner.DefaultLFPlanner
+      llm_client: *chat_llm
+      vectorize_model: *vectorize_model
+    lf_executor:
+      type: default_lf_executor # kag.solver.execute.default_lf_executor.DefaultLFExecutor
+      llm_client: *chat_llm
+      force_chunk_retriever: true
+      exact_kg_retriever: *exact_kg_retriever
+      fuzzy_kg_retriever: *fuzzy_kg_retriever
+      chunk_retriever: *chunk_retriever
+      merger:
+        type: default_lf_sub_query_res_merger # kag.solver.execute.default_sub_query_merger.DefaultLFSubQueryResMerger
+        vectorize_model: *vectorize_model
+        chunk_retriever: *chunk_retriever
+    generator:
+      type: default_generator # kag.solver.implementation.default_generator.DefaultGenerator
+      llm_client: *chat_llm
+      generate_prompt:
+        type: resp_simple # kag/examples/2wiki/solver/prompt/resp_generator.py
+    reflector:
+      type: default_reflector # kag.solver.implementation.default_reflector.DefaultReflector
+      llm_client: *chat_llm
+
+#------------kag-solver configuration end----------------#
+```
+
+Update the generative model configurations ``openie_llm`` and ``chat_llm`` as well as the representative model configuration ``vectorize_model`` in the configuration file.
+
+You need to fill in a valid ``api_key`` for each of them. If your model provider or model name differs from the defaults, you also need to update ``base_url`` and ``model``.
+
+#### Step 3: Create the project (i.e., the knowledge base in product mode)
+
+```bash
+knext project create --config_path ./example_config.yaml
+```
+
+#### Step 4: Initial contents of the directory
+
+After creating the project, a directory named after the ``namespace`` field of the ``project`` configuration (``TwoWikiTest`` in this example) is created under the ``kag/examples`` directory, and the KAG framework project code is initialized inside it.
+
+Users can modify one or more of the following files to customize knowledge graph construction and reasoning-based question answering for their own business scenarios.
+
+```text
+.
+├── builder
+│   ├── __init__.py
+│   ├── data
+│   │   └── __init__.py
+│   ├── indexer.py
+│   └── prompt
+│       └── __init__.py
+├── kag_config.yaml
+├── reasoner
+│   └── __init__.py
+├── schema
+│   ├── TwoWikiTest.schema
+│   └── __init__.py
+└── solver
+    ├── __init__.py
+    ├── data
+    │   └── __init__.py
+    └── prompt
+        └── __init__.py
+```
+
+### 2.2 Update the project (Optional)
+
+If the configuration changes later, refer to this section to push the updated configuration to the server.
+
+#### Step 1: Enter the project directory
+
+```bash
+cd kag/examples/TwoWikiTest
+```
+
+#### Step 2: Edit project configuration
+
+**Note**: The embedding vectors generated by different representation models can vary significantly, so it is recommended not to change the ``vectorize_model`` configuration after the project has been created. If you do need a different ``vectorize_model``, create a new project instead.
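+
+For reference, a typical post-creation update only touches the LLM settings. A minimal sketch, assuming you are switching ``chat_llm`` to another OpenAI-compatible provider (all values below are placeholders, not recommendations):
+
+```yaml
+# kag_config.yaml (excerpt): edit only the chat model; leave vectorize_model as-is
+chat_llm: &chat_llm
+  api_key: sk-your-key                            # placeholder key
+  base_url: https://example-llm-provider.com/v1   # placeholder endpoint
+  model: your-chat-model                          # placeholder model name
+  type: maas
+```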
+
+```bash
+vim ./kag_config.yaml
+```
+
+#### Step 3: Run the update command
+
+After editing the project configuration, use the ``knext project update`` command to push the local configuration to the OpenSPG server.
+
+```bash
+knext project update --proj_path .
+```
+
+## 3. Import documents
+
+### Step 1: Enter the project directory
+
+```bash
+cd kag/examples/TwoWikiTest
+```
+
+### Step 2: Retrieve corpus data
+
+The test corpus data for the 2wiki dataset is located at ``kag/examples/2wiki/builder/data/2wiki_corpus.json``, containing 6,119 documents and 1,000 question-answer pairs. To get through the whole process quickly, the same directory also contains a ``2wiki_sub_corpus.json`` file with only 7 documents. We will use this smaller dataset for the walkthrough.
+
+Copy it to the corresponding data directory of the ``TwoWikiTest`` project:
+
+```bash
+cp ../2wiki/builder/data/2wiki_sub_corpus.json builder/data
+```
+
+### Step 3: Edit the schema (Optional)
+
+Edit the schema file ``schema/TwoWikiTest.schema``. For an introduction to the OpenSPG schema, please refer to [Declarative Schema](https://openspg.yuque.com/ndx6g9/cwh47i/fiq6zum3qtzr7cne).
+
+### Step 4: Commit the schema to OpenSPG server
+
+```bash
+knext schema commit
+```
+
+### Step 5: Execute the build task
+
+Define the build task in ``builder/indexer.py``:
+
+```python
+import os
+import logging
+from kag.common.registry import import_modules_from_path
+
+from kag.builder.runner import BuilderChainRunner
+
+logger = logging.getLogger(__name__)
-    logger = logging.getLogger(__name__)
+
+
+def buildKB(file_path):
+    from kag.common.conf import KAG_CONFIG
-    file_path = os.path.dirname(__file__)
+    runner = BuilderChainRunner.from_config(
+        KAG_CONFIG.all_config["kag_builder_pipeline"]
+    )
+    runner.invoke(file_path)
-    suffix_mapping = {
-        "docx": DocxReader,
-        "pdf": PDFReader
+
+    logger.info(f"\n\nbuildKB successfully for {file_path}\n\n")
+
+
+if __name__ == "__main__":
+    import_modules_from_path(".")
+    dir_path = os.path.dirname(__file__)
+    # Set file_path to the path of the corpus file prepared earlier
+    file_path = os.path.join(dir_path, "data/2wiki_sub_corpus.json")
+
+    buildKB(file_path)
+```
+
+Run the ``indexer.py`` script to complete the knowledge graph construction for the unstructured data.
+
+```bash
+cd builder
+python indexer.py
+```
+
+After the build script starts, a checkpoint directory for the task is generated in the current working directory, recording the checkpoints and statistics of the build pipeline.
+
+```text
+ckpt
+├── chain
+├── extractor
+├── kag_checkpoint_0_1.ckpt
+├── postprocessor
+├── reader
+└── splitter
+```
+
+You can view the extraction task statistics, such as how many nodes and edges were extracted from each document, with the following command:
+
+```bash
+less ckpt/kag_checkpoint_0_1.ckpt
+```
+
+To see how many document entries were successfully written to the graph database, use the following command:
+
+```bash
+wc -l ckpt/kag_checkpoint_0_1.ckpt
+```
+
+The KAG framework provides checkpoint-based resumption. If the task is interrupted by a program error or another external factor (e.g., insufficient LLM invocation credits), you can rerun ``indexer.py``: KAG automatically loads the checkpoint file and reuses the existing results.
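+
+As a concrete sketch of that recovery flow (the ``rm -rf ckpt`` variant is an assumption for the case where you deliberately want to discard all previous build state):
+
+```bash
+# Resume an interrupted build: documents already recorded in ckpt/ are skipped
+python indexer.py
+
+# Or force a full rebuild by discarding the checkpoints first
+# (assumption: ckpt/ holds no results you still need)
+rm -rf ckpt && python indexer.py
+```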
+
+### Step 6: Inspect the constructed knowledge graph
+
+Currently, OpenSPG-KAG provides the [Knowledge Exploration](https://openspg.yuque.com/ndx6g9/cwh47i/mzq74eaynm4rqx4b) capability in product mode, along with the corresponding API documentation [HTTP API Reference](https://openspg.yuque.com/ndx6g9/cwh47i/qvbgge62p7argtd2).
+
+![KAG Knowledge Inspection Diagram](./images/kag-knowledge-inspection-diag.png)
+
+## 4. Reasoning-based question answering
+
+### Step 1: Enter the project directory
+
+```bash
+cd kag/examples/TwoWikiTest
+```
+
+### Step 2: Retrieve QA data
+
+The question-answer data corresponding to the ``2wiki_sub_corpus.json`` dataset is located at ``kag/examples/2wiki/solver/data/2wiki_qa_sub.json`` and contains two question-answer pairs. Copy it to the corresponding data directory of the ``TwoWikiTest`` project.
+
+```bash
+cp ../2wiki/solver/data/2wiki_qa_sub.json solver/data
+```
+
+### Step 3: Edit the QA script
+
+```bash
+vim ./solver/qa.py
+```
+
+Paste the following content into ``qa.py`` (or directly copy the question-answer script from the built-in 2wiki project).
+
+```python
+import json
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from tqdm import tqdm
+
+from kag.common.benchmarks.evaluate import Evaluate
+from kag.solver.logic.solver_pipeline import SolverPipeline
+from kag.common.conf import KAG_CONFIG
+from kag.common.registry import import_modules_from_path
+
+from kag.common.checkpointer import CheckpointerManager
+
+logger = logging.getLogger(__name__)
+
+
+class EvaFor2wiki:
+    """
+    init for kag client
+    """
+
+    def __init__(self):
+        pass
+
+    def qa(self, query):
+        """
+        qa from knowledge base
+        """
+        resp = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"])
+        answer, traceLog = resp.run(query)
+
+        logger.info(f"\n\nso the answer for '{query}' is: {answer}\n\n")
+        return answer, traceLog
+
+    def parallelQaAndEvaluate(
+        self, qaFilePath, resFilePath, threadNum=1, upperLimit=10
+    ):
+        """
+        parallel qa from knowledge base
+        and getBenchmarks(em, f1, answer_similarity)
+        """
+        ckpt = CheckpointerManager.get_checkpointer(
+            {"type": "zodb", "ckpt_dir": "ckpt"}
+        )
+
+        def process_sample(data):
+            try:
+                sample_idx, sample = data
+                sample_id = sample["_id"]
+                question = sample["question"]
+                gold = sample["answer"]
+                if question in ckpt:
+                    print(f"found existing answer to question: {question}")
+                    prediction, traceLog = ckpt.read_from_ckpt(question)
+                else:
+                    prediction, traceLog = self.qa(question)
+                    ckpt.write_to_ckpt(question, (prediction, traceLog))
+
+                evalObj = Evaluate()
+                metrics = evalObj.getBenchMark([prediction], [gold])
+                return sample_idx, sample_id, prediction, metrics, traceLog
+            except Exception:
+                import traceback
+
+                logger.warning(
+                    f"process sample failed with error:{traceback.format_exc()}\nfor: {data}"
+                )
+                return None
+
+        qaList = json.load(open(qaFilePath, "r"))
+        total_metrics = {
+            "em": 0.0,
+            "f1": 0.0,
+            "answer_similarity": 0.0,
+            "processNum": 0,
+        }
+        with ThreadPoolExecutor(max_workers=threadNum) as executor:
+            futures = [
+                executor.submit(process_sample, (sample_idx, sample))
+                for sample_idx, sample in enumerate(qaList[:upperLimit])
+            ]
+            for future in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc="parallelQaAndEvaluate completing: ",
+            ):
+                result = future.result()
+                if result is not None:
+                    sample_idx, sample_id, prediction, metrics, traceLog = result
+                    sample = qaList[sample_idx]
- class
KagDemoBuildChain(BuilderChainABC): + sample["prediction"] = prediction + sample["traceLog"] = traceLog + sample["em"] = str(metrics["em"]) + sample["f1"] = str(metrics["f1"]) - def build(self, **kwargs): - file_path = kwargs.get("file_path","a.docx") - suffix = file_path.split(".")[-1] - reader = suffix_mapping[suffix]() - if reader is None: - raise NotImplementedError - length_splitter = LengthSplitter(split_length=8000) - outline_splitter = OutlineSplitter() - project_id = int(os.getenv("KAG_PROJECT_ID")) - extractor = KAGExtractor(project_id=project_id) - writer = KGWriter() + total_metrics["em"] += metrics["em"] + total_metrics["f1"] += metrics["f1"] + total_metrics["answer_similarity"] += metrics["answer_similarity"] + total_metrics["processNum"] += 1 - chain = reader >> length_splitter >> outline_splitter >> extractor >> writer - return chain + if sample_idx % 50 == 0: + with open(resFilePath, "w") as f: + json.dump(qaList, f) - def buildKG(test_file,**kwargs): - chain = KagDemoBuildChain(file_path = test_file) - chain.invoke(test_file, max_workers=10) + with open(resFilePath, "w") as f: + json.dump(qaList, f) + + res_metrics = {} + for item_key, item_value in total_metrics.items(): + if item_key != "processNum": + res_metrics[item_key] = item_value / total_metrics["processNum"] + else: + res_metrics[item_key] = total_metrics["processNum"] + CheckpointerManager.close() + return res_metrics - if __name__ == "__main__": - init_kag_config(os.path.join(file_path,"../../../tests/builder/component/test_config.cfg")) - test_docx = os.path.join(file_path,"../../../tests/builder/data/test_docx.docx") - test_pdf = os.path.join(file_path,"../../../tests/builder/data/KnowledgeGraphTutorialSub.pdf") - buildKG(test_pdf) - ``` - * Build Knowledge Graph by run: - ```sh - python {project_dir}/builder/indexer.py - ``` +if __name__ == "__main__": + import_modules_from_path("./prompt") + evalObj = EvaFor2wiki() -* Question Answer(QA): - * Define SolverPipeline in {project_dir}/solver/evalFor{project_name}.py, for example: - ```python - import logging - import os + start_time = time.time() + filePath = "./data/2wiki_qa_sub.json" - from kag.common.env import init_kag_config - from kag.solver.logic.solver_pipeline import SolverPipeline + evalObj.qa("When did Lothair Ii's mother die?") - logger = logging.getLogger(__name__) + qaFilePath = os.path.join(os.path.abspath(os.path.dirname(__file__)), filePath) + resFilePath = os.path.join( + os.path.abspath(os.path.dirname(__file__)), f"2wikitest_res_{start_time}.json" + ) + total_metrics = evalObj.parallelQaAndEvaluate( + qaFilePath, resFilePath, threadNum=20, upperLimit=10000 + ) - - class KagDemo: - - """ - init for kag client - """ - - def __init__(self): - pass - - def qa(self, query): - resp = SolverPipeline() - answer, trace_log = resp.run(query) - - return answer,trace_log - - """ - parallel qa from knowledge base - and getBenchmarks(em, f1, answer_similarity) - - """ - - - if __name__ == "__main__": - demo = KagDemo() - query = "什么知识图谱是什么?" 
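+    # parallelQaAndEvaluate returns em/f1/answer_similarity averaged over the
+    # processed samples (plus the processed-sample count); the wall-clock cost
+    # is appended below before the metrics are dumped to disk.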
- answer,trace_log = demo.qa(query) - print(f"Question: {query}") - print(f"Answer: {answer}") - print(f"TraceLog: {trace_log}") - - ``` - * Run command: - ```sh - python {project_dir}/solver/evaFor{project_name}.py - ``` - - -## Restore project -If you have a project locally but do **not** have in spgserver, **restore** cmd will recover your local project in the spgserver, for example: - - ```sh -knext project recover --host_addr http://{host_ip}:8887 --proj_path {proj_path} + total_metrics["cost"] = time.time() - start_time + with open(f"./2wikitest_metrics_{start_time}.json", "w") as f: + json.dump(total_metrics, f) + print(total_metrics) ``` +### Step 4: Copy the answer generation prompt -## Update project -If you want to update your kag_config.cfg file, you can edit it and use update cmd, for example: -```sh -knext project update --proj_path {proj_path} +Copy the answer generation prompt from the built-in 2wiki project. + +```bash +cp ../2wiki/solver/prompt/resp_generator.py solver/prompt ``` + +### Step 5: Execute the QA task + +```bash +cd solver +python qa.py +``` + +After execution, the QA performance metrics will be printed. + +## 5. Other built-in examples + +You can enter the [kag/examples](.) directory to explore the built-in examples provided in the source code of KAG. + +* [musique](./musique/README.md) (Multi-hop Q&A) +* [twowiki](./2wiki/README.md) (Multi-hop Q&A) +* [hotpotqa](./hotpotqa/README.md) (Multi-hop Q&A) +* [Risk Mining Knowledge Graph](./riskmining/README.md) +* [Enterprise Supply Chain Knowledge Graph](./supplychain/README.md) +* [Medical Knowledge Graph](./medicine/README.md) + diff --git a/kag/examples/README_cn.md b/kag/examples/README_cn.md new file mode 100644 index 00000000..81b8987d --- /dev/null +++ b/kag/examples/README_cn.md @@ -0,0 +1,518 @@ +# KAG 示例 + +[English](./README.md) | +[简体中文](./README_cn.md) + +## 1. 前置条件 + +参考文档 [快速开始](https://openspg.yuque.com/ndx6g9/0.6/quzq24g4esal7q17) 安装 KAG 及其依赖的 OpenSPG server,了解开发者模式 KAG 的使用流程。 + +## 2. 
创建知识库

### 2.1 新建项目

#### Step 1:进入 examples 目录

+```bash
+cd kag/examples
+```
+
+#### Step 2:编辑项目配置
+
+```bash
+vim ./example_config.yaml
+```
+
+```yaml
+#------------project configuration start----------------#
+openie_llm: &openie_llm
+  api_key: key
+  base_url: https://api.deepseek.com
+  model: deepseek-chat
+  type: maas
+
+chat_llm: &chat_llm
+  api_key: key
+  base_url: https://api.deepseek.com
+  model: deepseek-chat
+  type: maas
+
+vectorize_model: &vectorize_model
+  api_key: key
+  base_url: https://api.siliconflow.cn/v1/
+  model: BAAI/bge-m3
+  type: openai
+  vector_dimensions: 1024
+vectorizer: *vectorize_model
+
+log:
+  level: INFO
+
+project:
+  biz_scene: default
+  host_addr: http://127.0.0.1:8887
+  id: "1"
+  language: en
+  namespace: TwoWikiTest
+#------------project configuration end----------------#
+
+#------------kag-builder configuration start----------------#
+kag_builder_pipeline:
+  chain:
+    type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain
+    extractor:
+      type: schema_free_extractor # kag.builder.component.extractor.schema_free_extractor.SchemaFreeExtractor
+      llm: *openie_llm
+      ner_prompt:
+        type: default_ner # kag.builder.prompt.default.ner.OpenIENERPrompt
+      std_prompt:
+        type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt
+      triple_prompt:
+        type: default_triple # kag.builder.prompt.default.triple.OpenIETriplePrompt
+    reader:
+      type: dict_reader # kag.builder.component.reader.dict_reader.DictReader
+    post_processor:
+      type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor
+      similarity_threshold: 0.9
+    splitter:
+      type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter
+      split_length: 100000
+      window_length: 0
+    vectorizer:
+      type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer
+      vectorize_model: *vectorize_model
+    writer:
+      type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter
+  num_threads_per_chain: 1
+  num_chains: 16
+  scanner:
+    type: 2wiki_dataset_scanner # kag.builder.component.scanner.dataset_scanner.MusiqueCorpusScanner
+#------------kag-builder configuration end----------------#
+
+#------------kag-solver configuration start----------------#
+search_api: &search_api
+  type: openspg_search_api #kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI
+
+graph_api: &graph_api
+  type: openspg_graph_api #kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi
+
+exact_kg_retriever: &exact_kg_retriever
+  type: default_exact_kg_retriever # kag.solver.retriever.impl.default_exact_kg_retriever.DefaultExactKgRetriever
+  el_num: 5
+  llm_client: *chat_llm
+  search_api: *search_api
+  graph_api: *graph_api
+
+fuzzy_kg_retriever: &fuzzy_kg_retriever
+  type: default_fuzzy_kg_retriever # kag.solver.retriever.impl.default_fuzzy_kg_retriever.DefaultFuzzyKgRetriever
+  el_num: 5
+  vectorize_model: *vectorize_model
+  llm_client: *chat_llm
+  search_api: *search_api
+  graph_api: *graph_api
+
+chunk_retriever: &chunk_retriever
+  type: default_chunk_retriever # kag.solver.retriever.impl.default_chunk_retriever.DefaultChunkRetriever
+  llm_client: *chat_llm
+  recall_num: 10
+  rerank_topk: 10
+
+kag_solver_pipeline:
+  memory:
+    type: default_memory # kag.solver.implementation.default_memory.DefaultMemory
+    llm_client: *chat_llm
+  max_iterations: 3
+  reasoner:
+    type: default_reasoner # kag.solver.implementation.default_reasoner.DefaultReasoner
+    llm_client: *chat_llm
+    lf_planner:
+      type: default_lf_planner # kag.solver.plan.default_lf_planner.DefaultLFPlanner
+      llm_client: *chat_llm
+      vectorize_model: *vectorize_model
+    lf_executor:
+      type: default_lf_executor # kag.solver.execute.default_lf_executor.DefaultLFExecutor
+      llm_client: *chat_llm
+      force_chunk_retriever: true
+      exact_kg_retriever: *exact_kg_retriever
+      fuzzy_kg_retriever: *fuzzy_kg_retriever
+      chunk_retriever: *chunk_retriever
+      merger:
+        type: default_lf_sub_query_res_merger # kag.solver.execute.default_sub_query_merger.DefaultLFSubQueryResMerger
+        vectorize_model: *vectorize_model
+        chunk_retriever: *chunk_retriever
+    generator:
+      type: default_generator # kag.solver.implementation.default_generator.DefaultGenerator
+      llm_client: *chat_llm
+      generate_prompt:
+        type: resp_simple # kag/examples/2wiki/solver/prompt/resp_generator.py
+    reflector:
+      type: default_reflector # kag.solver.implementation.default_reflector.DefaultReflector
+      llm_client: *chat_llm
+
+#------------kag-solver configuration end----------------#
+```
+
+您需要更新其中的生成模型配置 ``openie_llm`` 和 ``chat_llm``,以及表示模型配置 ``vectorize_model``。
+
+您需要设置正确的 ``api_key``。如果使用的模型供应商和模型名与默认值不同,您还需要更新 ``base_url`` 和 ``model``。
+
+#### Step 3:创建项目(与产品模式中的知识库一一对应)
+
+```bash
+knext project create --config_path ./example_config.yaml
+```
+
+#### Step 4:目录初始化
+
+创建项目之后会在 ``kag/examples`` 目录下创建一个与 ``project`` 配置中 ``namespace`` 字段同名的目录(示例中为 ``TwoWikiTest``),并完成 KAG 项目代码框架初始化。
+
+用户可以修改下述文件的一个或多个,完成业务自定义图谱构建 & 推理问答。
+
+```text
+.
+├── builder
+│   ├── __init__.py
+│   ├── data
+│   │   └── __init__.py
+│   ├── indexer.py
+│   └── prompt
+│       └── __init__.py
+├── kag_config.yaml
+├── reasoner
+│   └── __init__.py
+├── schema
+│   ├── TwoWikiTest.schema
+│   └── __init__.py
+└── solver
+    ├── __init__.py
+    ├── data
+    │   └── __init__.py
+    └── prompt
+        └── __init__.py
+```
+
+### 2.2 更新项目(Optional)
+
+如果有配置变更,可以参考本节内容,更新配置信息到服务端。
+
+#### Step 1:进入项目目录
+
+```bash
+cd kag/examples/TwoWikiTest
+```
+
+#### Step 2:编辑项目配置
+
+**注意**:由不同表示模型生成的 embedding 向量差异较大,``vectorize_model`` 配置在项目创建后建议不再更新;如有更新 ``vectorize_model`` 配置的需求,请创建一个新项目。
+
+```bash
+vim ./kag_config.yaml
+```
+
+#### Step 3:运行命令
+
+配置修改后,需要使用 ``knext project update`` 命令将本地配置信息更新到 OpenSPG 服务端。
+
+```bash
+knext project update --proj_path .
+```
+
+## 3. 导入文档
+
+### Step 1:进入项目目录
+
+```bash
+cd kag/examples/TwoWikiTest
+```
+
+### Step 2:获取语料数据
+
+2wiki 数据集的测试语料数据为 ``kag/examples/2wiki/builder/data/2wiki_corpus.json``,有 6119 篇文档和 1000 个问答对。为了迅速跑通整个流程,目录下还有一个 ``2wiki_sub_corpus.json`` 文件,只有 7 篇文档,我们以该小规模数据集为例进行试验。
+
+将其复制到 ``TwoWikiTest`` 项目的同名目录下:
+
+```bash
+cp ../2wiki/builder/data/2wiki_sub_corpus.json builder/data
+```
+
+### Step 3:编辑 schema(Optional)
+
+编辑 ``schema/TwoWikiTest.schema`` 文件,schema 文件格式参考 [声明式 schema](https://openspg.yuque.com/ndx6g9/0.6/fzhov4l2sst6bede) 相关章节。
+
+### Step 4:提交 schema 到服务端
+
+```bash
+knext schema commit
+```
+
+### Step 5:执行构建任务
+
+在 ``builder/indexer.py`` 文件中定义任务构建脚本:
+
+```python
+import os
+import logging
+from kag.common.registry import import_modules_from_path
+
+from kag.builder.runner import BuilderChainRunner
+
+logger = logging.getLogger(__name__)
+
+
+def buildKB(file_path):
+    from kag.common.conf import KAG_CONFIG
+
+    runner = BuilderChainRunner.from_config(
+        KAG_CONFIG.all_config["kag_builder_pipeline"]
+    )
+    runner.invoke(file_path)
+
+    logger.info(f"\n\nbuildKB successfully for {file_path}\n\n")
+
+
+if __name__ == "__main__":
+    import_modules_from_path(".")
+    dir_path = os.path.dirname(__file__)
+    # 将 file_path 设置为之前准备好的语料文件路径
+    file_path = os.path.join(dir_path, "data/2wiki_sub_corpus.json")
+
+    buildKB(file_path)
+```
+
+运行 ``indexer.py`` 脚本完成非结构化数据的图谱构建。
+
+```bash
+cd builder
+python indexer.py
+```
+
+构建脚本启动后,会在当前工作目录下生成任务的 checkpoint 目录,记录了构建链路的 checkpoint 和统计信息。
+
+```text
+ckpt
+├── chain
+├── extractor
+├── kag_checkpoint_0_1.ckpt
+├── postprocessor
+├── reader
+└── splitter
+```
+
+通过以下命令查看抽取任务统计信息,如每个文档抽取出多少点 / 边。
+
+```bash
+less ckpt/kag_checkpoint_0_1.ckpt
+```
+
+通过以下命令可以查看有多少文档数据被成功写入图数据库。
+
+```bash
+wc -l ckpt/kag_checkpoint_0_1.ckpt
+```
+
+KAG 框架基于 checkpoint 文件提供了断点续跑的功能。如果由于程序出错或其他外部原因(如 LLM 余额不足)导致任务中断,可以重新执行 ``indexer.py``,KAG 会自动加载 checkpoint 文件并复用已有结果。
+
+### Step 6:结果检查
+
+当前,OpenSPG-KAG 在产品端已提供 [知识探查](https://openspg.yuque.com/ndx6g9/0.6/fw4ge5c18tyfl2yq) 能力,以及对应的 API 文档 [HTTP API Reference](https://openspg.yuque.com/ndx6g9/0.6/zde1yunbb8sncxtv)。
+
+![KAG Knowledge Inspection Diagram](./images/kag-knowledge-inspection-diag.png)
+
+## 4. 推理问答
+
+### Step 1:进入项目目录
+
+```bash
+cd kag/examples/TwoWikiTest
+```
+
+### Step 2:获取问答数据
+
+与 ``2wiki_sub_corpus.json`` 数据集对应的问答数据为 ``kag/examples/2wiki/solver/data/2wiki_qa_sub.json``,有两个问答对。将其复制到 ``TwoWikiTest`` 项目的同名目录下。
+
+```bash
+cp ../2wiki/solver/data/2wiki_qa_sub.json solver/data
+```
+
+### Step 3:编写问答脚本
+
+```bash
+vim ./solver/qa.py
+```
+
+将以下内容粘贴到 ``qa.py`` 中(也可直接复制内置的 2wiki 项目的问答脚本)。
+
+```python
+import json
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from tqdm import tqdm
+
+from kag.common.benchmarks.evaluate import Evaluate
+from kag.solver.logic.solver_pipeline import SolverPipeline
+from kag.common.conf import KAG_CONFIG
+from kag.common.registry import import_modules_from_path
+
+from kag.common.checkpointer import CheckpointerManager
+
+logger = logging.getLogger(__name__)
+
+
+class EvaFor2wiki:
+    """
+    init for kag client
+    """
+
+    def __init__(self):
+        pass
+
+    def qa(self, query):
+        """
+        qa from knowledge base
+        """
+        resp = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"])
+        answer, traceLog = resp.run(query)
+
+        logger.info(f"\n\nso the answer for '{query}' is: {answer}\n\n")
+        return answer, traceLog
+
+    def parallelQaAndEvaluate(
+        self, qaFilePath, resFilePath, threadNum=1, upperLimit=10
+    ):
+        """
+        parallel qa from knowledge base
+        and getBenchmarks(em, f1, answer_similarity)
+        """
+        ckpt = CheckpointerManager.get_checkpointer(
+            {"type": "zodb", "ckpt_dir": "ckpt"}
+        )
+
+        def process_sample(data):
+            try:
+                sample_idx, sample = data
+                sample_id = sample["_id"]
+                question = sample["question"]
+                gold = sample["answer"]
+                if question in ckpt:
+                    print(f"found existing answer to question: {question}")
+                    prediction, traceLog = ckpt.read_from_ckpt(question)
+                else:
+                    prediction, traceLog = self.qa(question)
+                    ckpt.write_to_ckpt(question, (prediction, traceLog))
+
+                evalObj = Evaluate()
+                metrics = evalObj.getBenchMark([prediction], [gold])
+                return sample_idx, sample_id, prediction, metrics, traceLog
+            except Exception:
+                import traceback
+
+                logger.warning(
+                    f"process sample failed with error:{traceback.format_exc()}\nfor: {data}"
+                )
+                return None
+
+        qaList = json.load(open(qaFilePath, "r"))
+        total_metrics = {
+            "em": 0.0,
+            "f1": 0.0,
+            "answer_similarity": 0.0,
+            "processNum": 0,
+        }
+        with ThreadPoolExecutor(max_workers=threadNum) as executor:
+            futures = [
+                executor.submit(process_sample, (sample_idx, sample))
+                for sample_idx, sample in enumerate(qaList[:upperLimit])
+            ]
+            for future in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc="parallelQaAndEvaluate completing: ",
+            ):
+                result = future.result()
+                if result is not None:
+                    sample_idx, sample_id, prediction, metrics, traceLog = result
+                    sample = qaList[sample_idx]
+
+                    sample["prediction"] = prediction
+                    sample["traceLog"] = traceLog
+                    sample["em"] = str(metrics["em"])
+                    sample["f1"] = str(metrics["f1"])
+
+                    total_metrics["em"] += metrics["em"]
+                    total_metrics["f1"] += metrics["f1"]
+                    total_metrics["answer_similarity"] += metrics["answer_similarity"]
+                    total_metrics["processNum"] += 1
+
+                    if sample_idx % 50 == 0:
+                        with open(resFilePath, "w") as f:
+                            json.dump(qaList, f)
+
+        with open(resFilePath, "w") as f:
+            json.dump(qaList, f)
+
+        res_metrics = {}
+        for item_key, item_value in total_metrics.items():
+            if item_key != "processNum":
+                res_metrics[item_key] = item_value / total_metrics["processNum"]
+            else:
+                res_metrics[item_key] = total_metrics["processNum"]
+        CheckpointerManager.close()
+        return res_metrics
+
+
+if __name__ == "__main__":
+    import_modules_from_path("./prompt")
+    evalObj = EvaFor2wiki()
+
+    start_time = time.time()
+    filePath = "./data/2wiki_qa_sub.json"
+
+    evalObj.qa("When did Lothair Ii's mother die?")
+
+    qaFilePath = os.path.join(os.path.abspath(os.path.dirname(__file__)), filePath)
+    resFilePath = os.path.join(
+        os.path.abspath(os.path.dirname(__file__)), f"2wikitest_res_{start_time}.json"
+    )
+    total_metrics = evalObj.parallelQaAndEvaluate(
+        qaFilePath, resFilePath, threadNum=20, upperLimit=10000
+    )
+
+    total_metrics["cost"] = time.time() - start_time
+    with open(f"./2wikitest_metrics_{start_time}.json", "w") as f:
+        json.dump(total_metrics, f)
+    print(total_metrics)
+```
+
+### Step 4:复制答案生成 prompt
+
+复制 2wiki 项目的答案生成 prompt。
+
+```bash
+cp ../2wiki/solver/prompt/resp_generator.py solver/prompt
+```
+
+### Step 5:运行命令
+
+```bash
+cd solver
+python qa.py
+```
+
+执行结束后,会打印 QA 的效果指标。
+
+## 5. 其他内置案例
+
+可进入 [kag/examples](.) 目录体验源码中自带的案例。
+
+* [musique](./musique/README_cn.md)(多跳问答)
+* [twowiki](./2wiki/README_cn.md)(多跳问答)
+* [hotpotqa](./hotpotqa/README_cn.md)(多跳问答)
+* [黑产挖掘](./riskmining/README_cn.md)
+* [企业供应链](./supplychain/README_cn.md)
+* [医疗图谱](./medicine/README_cn.md)
+
diff --git a/kag/examples/images/kag-knowledge-inspection-diag.png b/kag/examples/images/kag-knowledge-inspection-diag.png
new file mode 100644
index 00000000..5b2d99e4
Binary files /dev/null and b/kag/examples/images/kag-knowledge-inspection-diag.png differ
diff --git a/kag/templates/project/solver/data/__init__.py b/kag/templates/project/solver/data/__init__.py
new file mode 100644
index 00000000..59bacd4d
--- /dev/null
+++ b/kag/templates/project/solver/data/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+"""
+Place the data files to be used by the solver (e.g., question-answer pairs) in this directory.
+"""