Files
pdf-to-kcf/src/pdf_to_kcf/pdf_reader.py
neutrino2211 b847133df2 Init
2025-12-19 20:41:08 +01:00

48 lines
1.3 KiB
Python

"""PDF reading and text extraction utilities."""
from pathlib import Path
from typing import List
from pypdf import PdfReader
class PDFDocument:
    """Wrapper around a pypdf ``PdfReader`` for page-wise text extraction.

    Attributes:
        pdf_path: Location of the PDF, stored as a ``Path``.
        reader: The ``PdfReader`` opened on ``pdf_path``.
        total_pages: Number of pages reported by the reader.
    """

    def __init__(self, pdf_path: str):
        """Initialize PDF reader with the given path.

        Args:
            pdf_path: Path to the PDF file

        Raises:
            FileNotFoundError: If ``pdf_path`` is not an existing regular
                file (the path is missing, or it names a directory).
        """
        self.pdf_path = Path(pdf_path)
        # is_file() instead of exists(): a directory "exists" but is not a
        # PDF file, so reject it here with the documented error rather than
        # letting the reader fail later with an unrelated OS error.
        if not self.pdf_path.is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        self.reader = PdfReader(str(self.pdf_path))
        self.total_pages = len(self.reader.pages)

    def get_page_text(self, page_number: int) -> str:
        """Extract text from a specific page.

        Args:
            page_number: 0-indexed page number

        Returns:
            Extracted text content from the page

        Raises:
            ValueError: If ``page_number`` is negative or not less than
                ``total_pages``.
        """
        # Negative indices are rejected explicitly; without this check the
        # page sequence lookup below could accept them as "from the end".
        if page_number < 0 or page_number >= self.total_pages:
            raise ValueError(
                f"Invalid page number {page_number}. Document has {self.total_pages} pages."
            )

        page = self.reader.pages[page_number]
        return page.extract_text()

    def get_all_pages(self) -> List[str]:
        """Extract text from all pages.

        Returns:
            List of text content for each page, in page order
        """
        return [self.get_page_text(i) for i in range(self.total_pages)]