init: initial commit
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
// PDFParser converts PDF documents to text. Extraction is delegated to the
// open-source ledongthuc/pdf package. The type is an empty struct, so its
// zero value is ready to use.
type PDFParser struct{}

// SupportedExtensions returns the file extensions this parser handles:
// a single-element slice containing ".pdf".
func (p *PDFParser) SupportedExtensions() []string {
	const pdfExt = ".pdf"

	exts := make([]string, 0, 1)
	exts = append(exts, pdfExt)
	return exts
}
|
||||
|
||||
func (p *PDFParser) ParseToMarkdown(path string) (string, error) {
|
||||
f, r, err := pdf.Open(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open pdf: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var sb strings.Builder
|
||||
totalPages := r.NumPage()
|
||||
|
||||
for i := 1; i <= totalPages; i++ {
|
||||
page := r.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
content := strings.TrimSpace(text)
|
||||
if content != "" {
|
||||
sb.WriteString(fmt.Sprintf("## Page %d\n\n%s\n\n", i, content))
|
||||
}
|
||||
}
|
||||
|
||||
if sb.Len() == 0 {
|
||||
// Fallback: try reading entire content at once
|
||||
content, err := readPDFPlainText(path)
|
||||
if err == nil && content != "" {
|
||||
return content, nil
|
||||
}
|
||||
return "", fmt.Errorf("no text content extracted from PDF")
|
||||
}
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
|
||||
func readPDFPlainText(path string) (string, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
stat, _ := f.Stat()
|
||||
reader, err := pdf.NewReader(f, stat.Size())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var sb strings.Builder
|
||||
for i := 1; i <= reader.NumPage(); i++ {
|
||||
page := reader.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, _ := page.GetPlainText(nil)
|
||||
sb.WriteString(text)
|
||||
}
|
||||
return sb.String(), nil
|
||||
}
|
||||
Reference in New Issue
Block a user