Files
AI-Writie-Assistant/internal/parser/pdf_parser.go
T
2026-04-07 17:35:09 +08:00

77 lines
1.5 KiB
Go

package parser
import (
"fmt"
"os"
"strings"
"github.com/ledongthuc/pdf"
)
// PDFParser extracts plain text from PDF files using the open-source
// github.com/ledongthuc/pdf library.
//
// The type has no fields, so its zero value is ready to use.
type PDFParser struct{}
// SupportedExtensions reports the file extensions this parser handles.
// Each entry includes the leading dot; currently the only one is ".pdf".
// A freshly allocated slice is returned on every call.
func (p *PDFParser) SupportedExtensions() []string {
	const pdfExt = ".pdf"

	exts := make([]string, 0, 1)
	return append(exts, pdfExt)
}
// ParseToMarkdown extracts the text of the PDF at path and renders it as
// Markdown, emitting one "## Page N" section for every page that yields
// non-blank text.
//
// Null pages and pages whose text extraction fails are skipped so that a
// single bad page does not discard the rest of the document. If no page
// produces any text, it falls back to readPDFPlainText, whose result is
// returned as-is (without page headings) provided it contains something other
// than whitespace.
//
// An error is returned when the file cannot be opened, or when neither pass
// yields any non-whitespace text; in the latter case the first underlying
// extraction failure, if any, is wrapped into the returned error.
func (p *PDFParser) ParseToMarkdown(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("open pdf: %w", err)
	}
	defer f.Close()

	var (
		sb      strings.Builder
		pageErr error // first per-page extraction failure, kept for diagnostics
	)
	totalPages := r.NumPage()
	for i := 1; i <= totalPages; i++ {
		page := r.Page(i)
		if page.V.IsNull() {
			continue
		}
		text, err := page.GetPlainText(nil)
		if err != nil {
			// Best effort: remember why the page failed, but keep going.
			if pageErr == nil {
				pageErr = fmt.Errorf("page %d: %w", i, err)
			}
			continue
		}
		if content := strings.TrimSpace(text); content != "" {
			fmt.Fprintf(&sb, "## Page %d\n\n%s\n\n", i, content)
		}
	}
	if sb.Len() > 0 {
		return sb.String(), nil
	}

	// Fallback: re-read the file and concatenate whatever text each page
	// yields. Apply the same "non-blank" rule as the main loop so that
	// whitespace-only output is not reported as a successful extraction.
	content, fallbackErr := readPDFPlainText(path)
	if fallbackErr == nil && strings.TrimSpace(content) != "" {
		return content, nil
	}

	switch {
	case pageErr != nil:
		return "", fmt.Errorf("no text content extracted from PDF: %w", pageErr)
	case fallbackErr != nil:
		return "", fmt.Errorf("no text content extracted from PDF: %w", fallbackErr)
	default:
		return "", fmt.Errorf("no text content extracted from PDF")
	}
}
// readPDFPlainText opens the PDF at path and returns the text of all its
// non-null pages concatenated in page order, with no separators or headings
// added.
//
// It returns an error if the file cannot be opened or stat'ed, or if the PDF
// reader cannot be constructed. Per-page extraction errors are not reported.
func readPDFPlainText(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", fmt.Errorf("open %q: %w", path, err)
	}
	defer f.Close()

	// The reader needs the file size up front. Check the Stat error rather
	// than dereferencing a nil FileInfo.
	stat, err := f.Stat()
	if err != nil {
		return "", fmt.Errorf("stat %q: %w", path, err)
	}

	reader, err := pdf.NewReader(f, stat.Size())
	if err != nil {
		return "", fmt.Errorf("read pdf %q: %w", path, err)
	}

	var sb strings.Builder
	totalPages := reader.NumPage()
	for i := 1; i <= totalPages; i++ {
		page := reader.Page(i)
		if page.V.IsNull() {
			continue
		}
		// Error deliberately ignored: this is a best-effort pass that keeps
		// whatever text the page yields and moves on to the next page.
		text, _ := page.GetPlainText(nil)
		sb.WriteString(text)
	}
	return sb.String(), nil
}