# Mirror of https://github.com/OpenSPG/KAG
# 164 lines, 5.6 KiB, Python
# -*- coding: utf-8 -*-
|
|
# Copyright 2023 OpenSPG Authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
# in compliance with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
# or implied.
|
|
|
|
import json
|
|
import os
|
|
from typing import List, Type, Dict, Union
|
|
|
|
from kag.builder.component.reader.markdown_reader import MarkDownReader
|
|
from kag.builder.model.chunk import Chunk
|
|
from kag.interface.builder.reader_abc import SourceReaderABC
|
|
from knext.common.base.runnable import Input, Output
|
|
|
|
from kag.common.llm.client import LLMClient
|
|
|
|
|
|
class JSONReader(SourceReaderABC):
    """
    Reader that converts JSON data into Chunk objects or plain dictionaries.

    The input handed to `invoke` may be either a path to a JSON file or a raw
    JSON string. A top-level JSON object is treated as a single record; a
    top-level JSON array is treated as a list of records.

    Args:
        output_type (str): "Dict" to return the parsed records unchanged; any
            other value (default "Chunk") converts each record into a Chunk.
        **kwargs: Forwarded unchanged to the parent class constructor. The
            optional keys `id_col`, `name_col` and `content_col` are also read
            here and name the record fields used for the chunk id, name and
            content (defaults: "id", "name", "content").
    """

    def __init__(self, output_type="Chunk", **kwargs):
        super().__init__(**kwargs)
        if output_type == "Dict":
            self.output_types = Dict[str, str]
        else:
            self.output_types = Chunk
        # Instance-level column configuration; `invoke` falls back to these
        # when the caller does not override them per call.
        self.id_col = kwargs.get("id_col", "id")
        self.name_col = kwargs.get("name_col", "name")
        self.content_col = kwargs.get("content_col", "content")

    @property
    def input_types(self) -> Type[Input]:
        """The reader consumes a string (file path or raw JSON text)."""
        return str

    @property
    def output_types(self) -> Type[Output]:
        """The configured output type (`Chunk` or `Dict[str, str]`)."""
        return self._output_types

    @output_types.setter
    def output_types(self, output_types):
        self._output_types = output_types

    @staticmethod
    def _read_from_file(file_path: str) -> Union[dict, list]:
        """
        Read a JSON file and return its parsed content.

        Args:
            file_path (str): The path to the JSON file.

        Returns:
            Union[dict, list]: The parsed JSON content.

        Raises:
            ValueError: If the file does not exist or does not contain valid JSON.
        """
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding.
            with open(file_path, "r", encoding="utf-8") as file:
                return json.load(file)
        except json.JSONDecodeError as e:
            raise ValueError(f"Error reading JSON from file: {e}") from e
        except FileNotFoundError as e:
            raise ValueError(f"File not found: {e}") from e

    @staticmethod
    def _parse_json_string(json_string: str) -> Union[dict, list]:
        """
        Parse a JSON string and return its content.

        Args:
            json_string (str): The JSON string to parse.

        Returns:
            Union[dict, list]: The parsed JSON content.

        Raises:
            ValueError: If the string is not valid JSON.
        """
        try:
            return json.loads(json_string)
        except json.JSONDecodeError as e:
            raise ValueError(f"Error parsing JSON string: {e}") from e

    def invoke(self, input: str, **kwargs) -> List[Output]:
        """
        Parse the input and return a list of Chunk objects or the parsed records.

        If `input` names an existing path it is read as a JSON file; otherwise
        it is parsed as a JSON string. A top-level object is wrapped into a
        one-element list. When the output type is Chunk, every dict record
        becomes one Chunk and non-dict records are skipped; otherwise the list
        of parsed records is returned as-is.

        Args:
            input (str): A JSON string or a path to a JSON file.
            **kwargs: Optional per-call overrides `id_col`, `name_col` and
                `content_col`. When omitted, the values configured on the
                instance are used.

        Returns:
            List[Output]: A list of Chunk objects or the parsed records.

        Raises:
            ValueError: If parsing fails or the top-level JSON value is neither
                an array nor an object.
        """
        # Fall back to the instance configuration rather than the hard-coded
        # defaults, so columns set in the constructor are honoured.
        id_col = kwargs.get("id_col", self.id_col)
        name_col = kwargs.get("name_col", self.name_col)
        content_col = kwargs.get("content_col", self.content_col)

        if os.path.exists(input):
            corpus = self._read_from_file(input)
        else:
            corpus = self._parse_json_string(input)

        if not isinstance(corpus, (list, dict)):
            raise ValueError("Expected input to be a JSON array or object")

        if isinstance(corpus, dict):
            corpus = [corpus]

        if self.output_types is not Chunk:
            return corpus

        # NOTE(review): when `input` is raw JSON text rather than a path, the
        # fallback id and name below are derived from that text itself.
        basename, _ = os.path.splitext(os.path.basename(input))
        chunks = []
        for idx, item in enumerate(corpus):
            if not isinstance(item, dict):
                continue
            chunks.append(
                Chunk(
                    id=item.get(id_col) or Chunk.generate_hash_id(f"{input}#{idx}"),
                    name=item.get(name_col) or f"{basename}#{idx}",
                    content=item.get(content_col),
                )
            )
        return chunks
|
|
|
|
if __name__ == "__main__":
    # Small manual demo: parse an inline JSON array whose records use
    # non-default field names for the chunk name and content.
    reader = JSONReader()
    json_string = '''[
    {
        "title": "test_json",
        "text": "Test content"
    }
    ]'''
    # `invoke` reads the keyword `name_col`; the previous `name_column`
    # spelling was silently ignored.
    chunks = reader.invoke(json_string, name_col="title", content_col="text")
    print(chunks)