# -*- coding: utf-8 -*-
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.

import logging
import os
import re
from typing import List, Sequence, Type, Union

from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTPage, LTTextContainer
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser

from kag.builder.model.chunk import Chunk
from kag.builder.prompt.outline_prompt import OutlinePrompt
from kag.interface.builder import SourceReaderABC
from knext.common.base.runnable import Input, Output

logger = logging.getLogger(__name__)

class PDFReader(SourceReaderABC):
    """
    A PDF reader class that inherits from SourceReaderABC.

    Attributes:
        split_level (int): Outline depth used when splitting. Default is 3.
        split_using_outline (bool): Whether to split the content by the PDF outline. Default is True.
        outline_flag (bool): Whether the PDF has a readable embedded outline; updated in `invoke`.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.split_level = kwargs.get("split_level", 3)
        self.split_using_outline = kwargs.get("split_using_outline", True)
        self.outline_flag = True
        self.llm = self._init_llm()
        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
        self.prompt = OutlinePrompt(language)

    @property
    def input_types(self) -> Type[Input]:
        return str

    @property
    def output_types(self) -> Type[Output]:
        return Chunk

    def outline_chunk(self, chunk: Union[Chunk, List[Chunk]], basename) -> List[Chunk]:
        """
        Asks the LLM to extract outline titles from the chunk contents, then
        splits the concatenated content at those titles.
        """
        if isinstance(chunk, Chunk):
            chunk = [chunk]
        outlines = []
        for c in chunk:
            outline = self.llm.invoke({"input": c.content}, self.prompt)
            outlines.extend(outline)
        content = "\n".join([c.content for c in chunk])
        chunks = self.sep_by_outline(content, outlines, basename)
        return chunks

    def sep_by_outline(self, content, outlines, basename):
        """
        Splits `content` into one chunk per outline title, each running from its
        title to the position of the next title.
        """
        position_check = []
        for outline in outlines:
            start = content.find(outline)
            if start < 0:
                # Skip outline titles that do not occur verbatim in the content.
                continue
            position_check.append((outline, start))
        chunks = []
        for idx, pc in enumerate(position_check):
            chunk = Chunk(
                id=Chunk.generate_hash_id(f"{basename}#{pc[0]}"),
                name=f"{basename}#{pc[0]}",
                # Slice up to the next outline position; for the last outline,
                # slice to the end of the content (the original sliced to
                # len(position_check), which truncated the final chunk).
                content=content[
                    pc[1] : position_check[idx + 1][1]
                    if idx + 1 < len(position_check)
                    else len(content)
                ],
            )
            chunks.append(chunk)
        return chunks
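
    # Illustrative sketch of sep_by_outline on made-up data (titles and content
    # below are hypothetical, not from a real PDF):
    #
    #     content  = "1 Intro ... 2 Methods ... 3 Results ..."
    #     outlines = ["1 Intro", "2 Methods", "3 Results"]
    #
    # position_check becomes [("1 Intro", 0), ("2 Methods", 12), ("3 Results", 26)],
    # so the first chunk holds content[0:12] == "1 Intro ... " and the last chunk
    # runs from index 26 to the end of the content.
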
    @staticmethod
    def _process_single_page(
        page: str,
        watermark: str,
        remove_header: bool = False,
        remove_footnote: bool = False,
        remove_lists: List[str] = None,
    ) -> List[str]:
        """
        Processes a single page of text, removing headers, footnotes, watermarks,
        and any strings given in `remove_lists`.

        Args:
            page (str): The text content of a single page.
            watermark (str): The watermark text to be removed.
            remove_header (bool): Whether to drop the first line. Default is False.
            remove_footnote (bool): Whether to drop the last line. Default is False.
            remove_lists (List[str]): Strings to be removed. Default is None.

        Returns:
            List[str]: The processed text lines.
        """
        lines = page.split("\n")
        if remove_header and len(lines) > 0:
            lines = lines[1:]
        if remove_footnote and len(lines) > 0:
            lines = lines[:-1]

        cleaned = [line.strip().replace(watermark, "") for line in lines]

        if remove_lists is None:
            return cleaned
        for s in remove_lists:
            cleaned = [line.strip().replace(s, "") for line in cleaned]

        return cleaned
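
    # Illustrative sketch of _process_single_page on a made-up page (hypothetical
    # input, not from a real PDF):
    #
    #     _process_single_page(
    #         "HEADER\nDRAFT body text\npage 1",
    #         watermark="DRAFT ",
    #         remove_header=True,
    #         remove_footnote=True,
    #     )
    #     # -> ["body text"]
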
    @staticmethod
    def _extract_text_from_page(page_layout: LTPage) -> str:
        """
        Extracts text from a given page layout.

        Args:
            page_layout (LTPage): The layout of the page containing text elements.

        Returns:
            str: The extracted text.
        """
        text = ""
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                text += element.get_text()
        return text
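
    # Usage sketch for _extract_text_from_page (assuming a hypothetical
    # "some.pdf"): pdfminer's extract_pages yields one LTPage per page, which
    # this helper flattens to plain text:
    #
    #     with open("some.pdf", "rb") as f:
    #         for page_layout in extract_pages(f):
    #             print(PDFReader._extract_text_from_page(page_layout))
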
    def invoke(self, input: str, **kwargs) -> Sequence[Output]:
        """
        Processes a PDF file, splitting or extracting content based on configuration.

        Args:
            input (str): The path to the PDF file.
            **kwargs: Additional keyword arguments, such as `clean_list`.

        Returns:
            Sequence[Output]: A sequence of processed outputs.

        Raises:
            ValueError: If the file is not a PDF file.
            FileNotFoundError: If the file does not exist.
            RuntimeError: If the content cannot be extracted or chunked.
        """
        if not input.endswith(".pdf"):
            raise ValueError(f"Please provide a pdf file, got {input}")

        if not os.path.isfile(input):
            raise FileNotFoundError(f"The file {input} does not exist.")

        chunks = []
        basename, _ = os.path.splitext(os.path.basename(input))

        # Read the outline embedded in the PDF. The original left the file handle
        # open indefinitely, so use a context manager and materialize the outline
        # generator while the file is still open.
        with open(input, "rb") as fd:
            parser = PDFParser(fd)
            document = PDFDocument(parser)
            try:
                outlines = list(document.get_outlines())
            except Exception as e:
                logger.warning(f"PDF has no readable outline: {e}")
                self.outline_flag = False

        if not self.outline_flag:
            # No embedded outline: chunk page by page, then ask the LLM to derive
            # an outline from the page contents.
            with open(input, "rb") as file:
                for idx, page_layout in enumerate(extract_pages(file)):
                    content = ""
                    for element in page_layout:
                        if hasattr(element, "get_text"):
                            content = content + element.get_text()
                    chunk = Chunk(
                        id=Chunk.generate_hash_id(f"{basename}#{idx}"),
                        name=f"{basename}#{idx}",
                        content=content,
                    )
                    chunks.append(chunk)
            try:
                outline_chunks = self.outline_chunk(chunks, basename)
            except Exception as e:
                raise RuntimeError(f"Error chunking PDF by LLM-extracted outline: {e}")
            if len(outline_chunks) > 0:
                chunks = outline_chunks

        else:
            # Use the embedded outline titles as split points.
            split_words = []
            for item in outlines:
                level, title, dest, a, se = item
                split_words.append(title.strip().replace(" ", ""))

            try:
                text = extract_text(input)
            except Exception as e:
                raise RuntimeError(f"Error loading PDF file: {e}")

            # extract_text returns the whole document as one string with pages
            # separated by form feeds; the original iterated over the string
            # itself, processing one character at a time instead of one page.
            cleaned_pages = [
                self._process_single_page(x, "", False, False)
                for x in text.split("\f")
            ]
            sentences = []
            for cleaned_page in cleaned_pages:
                sentences += cleaned_page

            content = "".join(sentences)
            positions = [(input, 0)]
            for split_word in split_words:
                # Escape the title: outline titles may contain regex metacharacters.
                pattern = re.compile(re.escape(split_word))
                for i, match in enumerate(re.finditer(pattern, content)):
                    # Take the second occurrence: the first typically sits in the
                    # table of contents, the second at the section heading itself.
                    if i == 1:
                        start, end = match.span()
                        positions.append((split_word, start))

            for idx, position in enumerate(positions):
                chunk = Chunk(
                    id=Chunk.generate_hash_id(f"{basename}#{position[0]}"),
                    name=f"{basename}#{position[0]}",
                    content=content[
                        position[1] : positions[idx + 1][1]
                        if idx + 1 < len(positions)
                        else None
                    ],
                )
                chunks.append(chunk)

        return chunks
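
    # Minimal sketch of the form-feed assumption used above (pdfminer's
    # extract_text separates pages with "\f"), on a hypothetical "some.pdf":
    #
    #     from pdfminer.high_level import extract_text
    #     pages = extract_text("some.pdf").split("\f")
    #     # len(pages) == page count, usually plus one trailing empty string
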

if __name__ == "__main__":
    reader = PDFReader(split_using_outline=True)
    pdf_path = os.path.join(
        os.path.dirname(__file__), "../../../../tests/builder/data/aiwen.pdf"
    )
    chunks = reader.invoke(pdf_path)
    print(chunks)