# KAG/kag/builder/component/reader/pdf_reader.py

# 254 lines
# 8.7 KiB
# Python

# -*- coding: utf-8 -*-
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
import os
import re
from typing import List, Sequence, Type, Union
from langchain_community.document_loaders import PyPDFLoader
import pdfminer.layout
from kag.builder.model.chunk import Chunk
from kag.interface.builder import SourceReaderABC
from knext.common.base.runnable import Input, Output
from kag.builder.prompt.outline_prompt import OutlinePrompt
from pdfminer.high_level import extract_text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.layout import LAParams,LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
import pdfminer
import logging
logger = logging.getLogger(__name__)
class PDFReader(SourceReaderABC):
    """
    A PDF reader that converts a PDF file into a list of ``Chunk`` objects.

    If the PDF exposes an embedded outline (bookmarks), the extracted text is
    split at the outline titles. Otherwise the text is collected page by page
    and the LLM is asked (through ``OutlinePrompt``) for titles to split on.

    Attributes:
        split_level (int): Value of the ``split_level`` keyword argument
            (default 3). Stored only; nothing in this class reads it.
        split_using_outline (bool): Value of the ``split_using_outline``
            keyword argument (default True). Stored only; nothing in this
            class reads it.
        outline_flag (bool): Whether the most recent ``invoke`` call managed
            to read the PDF's embedded outline. True before the first call.
        llm: Object returned by ``self._init_llm()`` (defined outside this
            class; presumably on ``SourceReaderABC`` - verify).
        prompt (OutlinePrompt): Prompt built for the language given by the
            ``KAG_PROMPT_LANGUAGE`` environment variable (default ``"zh"``).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.split_level = kwargs.get("split_level", 3)
        self.split_using_outline = kwargs.get("split_using_outline", True)
        self.outline_flag = True
        self.llm = self._init_llm()
        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
        self.prompt = OutlinePrompt(language)

    @property
    def input_types(self) -> Type[Input]:
        """Input type accepted by ``invoke``: a file path string."""
        return str

    @property
    def output_types(self) -> Type[Output]:
        """Output element type produced by ``invoke``."""
        return Chunk

    def outline_chunk(self, chunk: Union[Chunk, List[Chunk]], basename) -> List[Chunk]:
        """
        Asks the LLM for outline titles and re-splits the chunks on them.

        Args:
            chunk (Union[Chunk, List[Chunk]]): One chunk or a list of chunks.
            basename: Prefix used for the names/ids of the produced chunks.
        Returns:
            List[Chunk]: Chunks produced by ``sep_by_outline``.
        """
        if isinstance(chunk, Chunk):
            chunk = [chunk]
        outlines = []
        for c in chunk:
            # NOTE(review): the LLM result is assumed to be an iterable of
            # title strings (it is passed to ``extend`` and ``str.find``).
            outline = self.llm.invoke({"input": c.content}, self.prompt)
            outlines.extend(outline)
        content = "\n".join([c.content for c in chunk])
        return self.sep_by_outline(content, outlines, basename)

    def sep_by_outline(self, content, outlines, basename):
        """
        Splits ``content`` at the first occurrence of each outline title.

        Titles that do not occur in ``content`` are skipped: ``str.find``
        returns -1 for them, which would otherwise corrupt the slice bounds of
        the neighbouring chunks.

        Args:
            content (str): The full text to split.
            outlines: Iterable of title strings.
            basename: Prefix used for chunk names/ids.
        Returns:
            list: One ``Chunk`` per located title, each running from its title
            to the next located title (the last one runs to the end of
            ``content``).
        """
        located = []
        for outline in outlines:
            start = content.find(outline)
            if start < 0:
                continue
            located.append((outline, start))

        chunks = []
        for idx, (title, start) in enumerate(located):
            # The last chunk must run to the end of the text, not to the
            # number of titles.
            end = located[idx + 1][1] if idx + 1 < len(located) else len(content)
            chunks.append(
                Chunk(
                    id=Chunk.generate_hash_id(f"{basename}#{title}"),
                    name=f"{basename}#{title}",
                    content=content[start:end],
                )
            )
        return chunks

    @staticmethod
    def _process_single_page(
        page: str,
        watermark: str,
        remove_header: bool = False,
        remove_footnote: bool = False,
        remove_lists: List[str] = None,
    ) -> list:
        """
        Processes a single page of text, removing headers, footnotes, watermarks, and specified lists.
        Args:
            page (str): The text content of a single page.
            watermark (str): The watermark text to be removed.
            remove_header (bool): Whether to drop the first line. Default is False.
            remove_footnote (bool): Whether to drop the last line. Default is False.
            remove_lists (List[str]): Strings to be removed from every line. Default is None.
        Returns:
            list: A list of processed text lines.
        """
        lines = page.split("\n")
        if remove_header and len(lines) > 0:
            lines = lines[1:]
        if remove_footnote and len(lines) > 0:
            lines = lines[:-1]
        cleaned = [line.strip().replace(watermark, "") for line in lines]
        if remove_lists is None:
            return cleaned
        for s in remove_lists:
            cleaned = [line.strip().replace(s, "") for line in cleaned]
        return cleaned

    @staticmethod
    def _extract_text_from_page(page_layout: LTPage) -> str:
        """
        Extracts text from a given page layout.
        Args:
            page_layout (LTPage): The layout of the page containing text elements.
        Returns:
            str: The concatenated text of all ``LTTextContainer`` elements.
        """
        return "".join(
            element.get_text()
            for element in page_layout
            if isinstance(element, LTTextContainer)
        )

    @staticmethod
    def _load_outlines(path):
        """
        Returns the PDF's embedded outline entries as a list, or None.

        The file is closed before returning. The outline is materialised inside
        the ``with`` block because ``get_outlines`` yields lazily and needs the
        file open. Any failure while reading the outline is logged and reported
        as None; errors from constructing the document itself propagate.
        """
        with open(path, "rb") as fd:
            document = PDFDocument(PDFParser(fd))
            try:
                return list(document.get_outlines())
            except Exception as e:
                logger.warning("loading PDF file: %s", e)
                return None

    def _chunks_from_llm_outline(self, path, basename):
        """
        Builds one chunk per page, then re-splits them on LLM-proposed titles.

        Falls back to the per-page chunks when the LLM split yields nothing.
        """
        chunks = []
        with open(path, "rb") as file:
            for idx, page_layout in enumerate(extract_pages(file)):
                content = "".join(
                    element.get_text()
                    for element in page_layout
                    if hasattr(element, "get_text")
                )
                chunks.append(
                    Chunk(
                        id=Chunk.generate_hash_id(f"{basename}#{idx}"),
                        name=f"{basename}#{idx}",
                        content=content,
                    )
                )
        try:
            outline_chunks = self.outline_chunk(chunks, basename)
        except Exception as e:
            raise RuntimeError(f"Error loading PDF file: {e}") from e
        if len(outline_chunks) > 0:
            return outline_chunks
        return chunks

    @staticmethod
    def _chunks_from_pdf_outline(path, basename, outlines):
        """
        Splits the PDF's text at the titles of its embedded outline.

        Args:
            path (str): Path to the PDF file.
            basename: Prefix used for chunk names/ids.
            outlines: 5-tuples from ``PDFDocument.get_outlines``; only the
                second field (the title) is used.
        """
        split_words = []
        for item in outlines:
            _level, title, _dest, _action, _se = item
            split_words.append(title.strip().replace(" ", ""))

        try:
            text = extract_text(path)
        except Exception as e:
            raise RuntimeError(f"Error loading PDF file: {e}") from e

        # Drop all whitespace so the text can be matched against the
        # space-stripped titles. The previous implementation reached the same
        # result by running every single character through
        # ``_process_single_page``.
        content = "".join(text.split())

        # The leading entry covers everything before the first located title;
        # it is labelled with the input path.
        positions = [(path, 0)]
        for split_word in split_words:
            if not split_word:
                continue
            # Titles are matched literally (they may contain regex
            # metacharacters). The split point is the SECOND non-overlapping
            # occurrence - presumably the first is the table-of-contents
            # entry; TODO confirm. Titles occurring fewer than two times are
            # not used as split points.
            first = content.find(split_word)
            if first < 0:
                continue
            second = content.find(split_word, first + len(split_word))
            if second < 0:
                continue
            positions.append((split_word, second))

        chunks = []
        for idx, (label, start) in enumerate(positions):
            end = positions[idx + 1][1] if idx + 1 < len(positions) else None
            chunks.append(
                Chunk(
                    id=Chunk.generate_hash_id(f"{basename}#{label}"),
                    name=f"{basename}#{label}",
                    content=content[start:end],
                )
            )
        return chunks

    def invoke(self, input: str, **kwargs) -> Sequence[Output]:
        """
        Processes a PDF file and splits its text into chunks.

        Uses the PDF's embedded outline when it can be read; otherwise falls
        back to page-wise extraction followed by an LLM-proposed outline.
        ``self.outline_flag`` is updated on every call to record which path
        was taken.

        Args:
            input (str): The path to the PDF file.
            **kwargs: Accepted for interface compatibility; not used.
        Returns:
            Sequence[Output]: The produced chunks.
        Raises:
            ValueError: If the path does not end with ``.pdf``.
            FileNotFoundError: If the file does not exist.
            RuntimeError: If text extraction or the LLM outline step fails.
        """
        if not input.endswith(".pdf"):
            raise ValueError(f"Please provide a pdf file, got {input}")
        if not os.path.isfile(input):
            raise FileNotFoundError(f"The file {input} does not exist.")

        basename, _ = os.path.splitext(os.path.basename(input))

        outlines = self._load_outlines(input)
        # Re-evaluated per call so one outline-less PDF does not force the LLM
        # path for every later file read with the same reader.
        self.outline_flag = outlines is not None

        if outlines is None:
            return self._chunks_from_llm_outline(input, basename)
        return self._chunks_from_pdf_outline(input, basename, outlines)
if __name__ == '__main__':
    # Manual smoke run: read the sample PDF shipped with the tests and print
    # the resulting chunks.
    pdf_reader = PDFReader(split_using_outline=True)
    module_dir = os.path.dirname(__file__)
    sample_pdf = os.path.join(module_dir, "../../../../tests/builder/data/aiwen.pdf")
    result = pdf_reader.invoke(sample_pdf)
    print(result)