77 lines
1.5 KiB
Go
77 lines
1.5 KiB
Go
package parser
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
|
|
"github.com/ledongthuc/pdf"
|
|
)
|
|
|
|
// PDFParser converts PDF files to plain text using the open-source
// ledongthuc/pdf library. The type has no fields, so its zero value is
// ready to use.
type PDFParser struct{}

// SupportedExtensions reports the file extensions this parser handles.
// Each extension includes its leading dot. A freshly allocated slice is
// returned on every call, so callers may modify it freely.
func (p *PDFParser) SupportedExtensions() []string {
	exts := make([]string, 0, 1)
	exts = append(exts, ".pdf")
	return exts
}
|
|
|
|
func (p *PDFParser) ParseToMarkdown(path string) (string, error) {
|
|
f, r, err := pdf.Open(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("open pdf: %w", err)
|
|
}
|
|
defer f.Close()
|
|
|
|
var sb strings.Builder
|
|
totalPages := r.NumPage()
|
|
|
|
for i := 1; i <= totalPages; i++ {
|
|
page := r.Page(i)
|
|
if page.V.IsNull() {
|
|
continue
|
|
}
|
|
text, err := page.GetPlainText(nil)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
content := strings.TrimSpace(text)
|
|
if content != "" {
|
|
sb.WriteString(fmt.Sprintf("## Page %d\n\n%s\n\n", i, content))
|
|
}
|
|
}
|
|
|
|
if sb.Len() == 0 {
|
|
// Fallback: try reading entire content at once
|
|
content, err := readPDFPlainText(path)
|
|
if err == nil && content != "" {
|
|
return content, nil
|
|
}
|
|
return "", fmt.Errorf("no text content extracted from PDF")
|
|
}
|
|
|
|
return sb.String(), nil
|
|
}
|
|
|
|
func readPDFPlainText(path string) (string, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer f.Close()
|
|
stat, _ := f.Stat()
|
|
reader, err := pdf.NewReader(f, stat.Size())
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
var sb strings.Builder
|
|
for i := 1; i <= reader.NumPage(); i++ {
|
|
page := reader.Page(i)
|
|
if page.V.IsNull() {
|
|
continue
|
|
}
|
|
text, _ := page.GetPlainText(nil)
|
|
sb.WriteString(text)
|
|
}
|
|
return sb.String(), nil
|
|
}
|