package parser

import (
	"fmt"
	"os"
	"strings"

	"github.com/ledongthuc/pdf"
)

// PDFParser extracts plain text from PDF files using ledongthuc/pdf (open source).
type PDFParser struct{}

// SupportedExtensions returns the file extensions handled by this parser,
// which is only ".pdf".
func (p *PDFParser) SupportedExtensions() []string {
	return []string{".pdf"}
}

// ParseToMarkdown opens the PDF at path and renders its text as Markdown,
// emitting one "## Page N" section per page that yields non-blank text.
//
// Null pages and pages whose text extraction fails are skipped. If no page
// produces any text, a second pass through readPDFPlainText is attempted and
// its raw, heading-less output is returned when it is not blank. When nothing
// can be extracted an error is returned; it wraps the first per-page
// extraction error, if one occurred, so the underlying cause is not lost.
func (p *PDFParser) ParseToMarkdown(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("open pdf: %w", err)
	}
	defer f.Close()

	var (
		sb       strings.Builder
		firstErr error // first per-page failure, reported only if nothing is extracted
	)
	totalPages := r.NumPage()
	for i := 1; i <= totalPages; i++ {
		page := r.Page(i)
		if page.V.IsNull() {
			continue
		}
		text, err := page.GetPlainText(nil)
		if err != nil {
			// Best effort: one unreadable page must not discard the others.
			if firstErr == nil {
				firstErr = fmt.Errorf("page %d: %w", i, err)
			}
			continue
		}
		if content := strings.TrimSpace(text); content != "" {
			fmt.Fprintf(&sb, "## Page %d\n\n%s\n\n", i, content)
		}
	}
	if sb.Len() > 0 {
		return sb.String(), nil
	}

	// Fallback: try reading the entire content at once. Whitespace-only output
	// is treated as empty, matching the per-page check above.
	if content, err := readPDFPlainText(path); err == nil && strings.TrimSpace(content) != "" {
		return content, nil
	}
	if firstErr != nil {
		return "", fmt.Errorf("no text content extracted from PDF: %w", firstErr)
	}
	return "", fmt.Errorf("no text content extracted from PDF")
}

// readPDFPlainText opens the PDF at path and returns the plain text of all of
// its pages concatenated in page order, without headings or trimming.
//
// It returns an error if the file cannot be opened, its size cannot be
// determined, or the PDF reader cannot be created. Null pages and pages whose
// text extraction fails are skipped.
func readPDFPlainText(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// The reader needs the file size; a failed Stat must not be ignored,
	// because the returned FileInfo would be nil.
	stat, err := f.Stat()
	if err != nil {
		return "", fmt.Errorf("stat pdf: %w", err)
	}
	reader, err := pdf.NewReader(f, stat.Size())
	if err != nil {
		return "", err
	}

	var sb strings.Builder
	totalPages := reader.NumPage()
	for i := 1; i <= totalPages; i++ {
		page := reader.Page(i)
		if page.V.IsNull() {
			continue
		}
		text, err := page.GetPlainText(nil)
		if err != nil {
			// Best effort: skip pages that cannot be decoded.
			continue
		}
		sb.WriteString(text)
	}
	return sb.String(), nil
}