Init
This commit is contained in:
47
src/pdf_to_kcf/pdf_reader.py
Normal file
47
src/pdf_to_kcf/pdf_reader.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""PDF reading and text extraction utilities."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
class PDFDocument:
    """Thin wrapper around a ``pypdf.PdfReader`` for page-wise text extraction.

    Attributes:
        pdf_path: Location of the PDF, stored as a ``pathlib.Path``.
        reader: The ``PdfReader`` instance opened on ``pdf_path``.
        total_pages: Number of pages reported by the reader at construction.
    """

    def __init__(self, pdf_path: str):
        """Open the PDF at the given path and record its page count.

        Args:
            pdf_path: Path to the PDF file.

        Raises:
            FileNotFoundError: If nothing exists at ``pdf_path``.
        """
        self.pdf_path = Path(pdf_path)
        # Fail early with a clear message, before handing the path to the reader.
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        self.reader = PdfReader(str(self.pdf_path))
        self.total_pages = len(self.reader.pages)

    def get_page_text(self, page_number: int) -> str:
        """Extract the text of a single page.

        Args:
            page_number: 0-indexed page number.

        Returns:
            Whatever ``extract_text()`` yields for the requested page.

        Raises:
            ValueError: If ``page_number`` is negative or not smaller than
                ``total_pages``.
        """
        # Explicit range check: negative indices are rejected rather than
        # being passed through to the page sequence.
        if page_number < 0 or page_number >= self.total_pages:
            raise ValueError(
                f"Invalid page number {page_number}. Document has {self.total_pages} pages."
            )

        page = self.reader.pages[page_number]
        return page.extract_text()

    def get_all_pages(self) -> List[str]:
        """Extract the text of every page, in document order.

        Returns:
            One entry per page, index-aligned with the 0-based page numbers.
        """
        # Delegate to get_page_text so there is a single extraction path.
        return [self.get_page_text(i) for i in range(self.total_pages)]
|
||||
Reference in New Issue
Block a user