KAG/kag/builder/component/reader/txt_reader.py

68 lines
2.2 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
import os
from typing import List, Type
from kag.builder.model.chunk import Chunk
from kag.interface.builder import SourceReaderABC
from knext.common.base.runnable import Input, Output
class TXTReader(SourceReaderABC):
    """
    A plain-text reader built on SourceReaderABC.

    It accepts either the path of an existing text file or a literal string of
    text and wraps the resulting content in a single Chunk.
    """

    @property
    def input_types(self) -> Type[Input]:
        """Input type handled by this reader: ``str``."""
        return str

    @property
    def output_types(self) -> Type[Output]:
        """Output type produced by this reader: ``Chunk``."""
        return Chunk

    def invoke(self, input: Input, **kwargs) -> List[Output]:
        """
        Read text from a file path or take it verbatim, and wrap it in a Chunk.

        When ``input`` names an existing filesystem path, that path is opened
        as UTF-8 text and its full contents become the chunk content. In every
        other case ``input`` itself is used as the content.

        The chunk id is ``Chunk.generate_hash_id(input)`` and the chunk name is
        the basename of ``input`` with its extension removed; both are derived
        from ``input`` even when it is literal text rather than a path.

        Args:
            input (Input): A path to a text file, or the text content itself.
            **kwargs: Accepted but not used by this method.

        Returns:
            List[Output]: A one-element list holding the created Chunk.

        Raises:
            ValueError: If ``input`` is empty (falsy).
            IOError: If an OSError occurs while opening or reading the path.
        """
        if not input:
            raise ValueError("Input cannot be empty")

        # Start from the literal-text case; overwrite only if a file is read.
        text = input
        try:
            if os.path.exists(input):
                with open(input, "r", encoding='utf-8') as handle:
                    text = handle.read()
        except OSError as err:
            raise IOError(f"Failed to read file: {input}") from err

        # Name is the last path component of the input with its extension dropped.
        stem = os.path.splitext(os.path.basename(input))[0]
        return [
            Chunk(
                id=Chunk.generate_hash_id(input),
                name=stem,
                content=text,
            )
        ]