"""PDF reading and text extraction utilities."""

from pathlib import Path
from typing import List

from pypdf import PdfReader


class PDFDocument:
    """Wrapper for PDF document reading.

    Attributes:
        pdf_path: Location of the PDF file, stored as a ``Path``.
        reader: The ``PdfReader`` constructed from ``pdf_path``.
        total_pages: Number of pages in ``reader.pages``, captured once at
            construction time.
    """

    def __init__(self, pdf_path: str) -> None:
        """Initialize PDF reader with the given path.

        Args:
            pdf_path: Path to the PDF file

        Raises:
            FileNotFoundError: If nothing exists at ``pdf_path``.
        """
        self.pdf_path = Path(pdf_path)
        # Check up front so a missing file produces a message that names the
        # path the caller passed in, before the reader is ever constructed.
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        self.reader = PdfReader(str(self.pdf_path))
        self.total_pages = len(self.reader.pages)

    def get_page_text(self, page_number: int) -> str:
        """Extract text from a specific page.

        Args:
            page_number: 0-indexed page number

        Returns:
            Extracted text content from the page

        Raises:
            ValueError: If ``page_number`` is negative or not less than
                ``total_pages``.
        """
        if page_number < 0 or page_number >= self.total_pages:
            raise ValueError(
                f"Invalid page number {page_number}. Document has {self.total_pages} pages."
            )

        page = self.reader.pages[page_number]
        return page.extract_text()

    def get_all_pages(self) -> List[str]:
        """Extract text from all pages.

        Returns:
            List of text content for each page, ordered by page number
        """
        # Go through get_page_text so there is a single extraction code path.
        return [self.get_page_text(i) for i in range(self.total_pages)]