48 lines
1.3 KiB
Python
48 lines
1.3 KiB
Python
"""PDF reading and text extraction utilities."""
|
|
|
|
from pathlib import Path
|
|
from typing import List
|
|
from pypdf import PdfReader
|
|
|
|
|
|
class PDFDocument:
    """Wrapper for PDF document reading.

    Opens a PDF through ``pypdf.PdfReader`` and exposes text extraction for a
    single page or for every page in order.

    Attributes:
        pdf_path: Location of the PDF as a ``pathlib.Path``.
        reader: The ``PdfReader`` opened on ``pdf_path``.
        total_pages: Number of pages the reader reported at construction.
    """

    def __init__(self, pdf_path: str):
        """Initialize PDF reader with the given path.

        Args:
            pdf_path: Path to the PDF file

        Raises:
            FileNotFoundError: If nothing exists at ``pdf_path``.
            IsADirectoryError: If ``pdf_path`` points at a directory.
        """
        self.pdf_path = Path(pdf_path)
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        # A directory passes exists(); reject it here so the caller gets the
        # same clear error on every platform instead of whatever the
        # underlying open() call happens to raise.
        if self.pdf_path.is_dir():
            raise IsADirectoryError(
                f"Expected a PDF file but got a directory: {pdf_path}"
            )

        self.reader = PdfReader(str(self.pdf_path))
        self.total_pages = len(self.reader.pages)

    def get_page_text(self, page_number: int) -> str:
        """Extract text from a specific page.

        Args:
            page_number: 0-indexed page number

        Returns:
            Extracted text content from the page

        Raises:
            ValueError: If ``page_number`` is negative or not smaller than
                ``total_pages``.
        """
        # Negative indices are rejected explicitly; they would otherwise be
        # accepted by sequence indexing and silently count from the end.
        if not 0 <= page_number < self.total_pages:
            raise ValueError(
                f"Invalid page number {page_number}. Document has {self.total_pages} pages."
            )

        return self.reader.pages[page_number].extract_text()

    def get_all_pages(self) -> List[str]:
        """Extract text from all pages.

        Returns:
            List of text content for each page, in page order
        """
        # Delegates to get_page_text so both entry points share one code path.
        return [self.get_page_text(index) for index in range(self.total_pages)]
|