init: initial commit
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
// WordParser extracts text from .docx files.
//
// The type carries no state, so its zero value is ready to use.
type WordParser struct{}
|
||||
|
||||
// SupportedExtensions returns extensions this parser handles
|
||||
func (p *WordParser) SupportedExtensions() []string {
|
||||
return []string{".docx"}
|
||||
}
|
||||
|
||||
type wDocument struct {
|
||||
Body wBody `xml:"body"`
|
||||
}
|
||||
|
||||
type wBody struct {
|
||||
P []wP `xml:"p"`
|
||||
}
|
||||
|
||||
type wP struct {
|
||||
R []wR `xml:"r"`
|
||||
}
|
||||
|
||||
// wR is the encoding/xml mapping for an "r" (run) element.
type wR struct {
	// T holds the character data of each direct "t" child, in document
	// order. Other children of the run are ignored.
	T []string `xml:"t"`
}
|
||||
|
||||
// ParseToMarkdown extracts text from the word document and returns it as Markdown-like text
|
||||
func (p *WordParser) ParseToMarkdown(path string) (string, error) {
|
||||
r, err := zip.OpenReader(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to open docx as zip: %w", err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
var docFile *zip.File
|
||||
for _, f := range r.File {
|
||||
if f.Name == "word/document.xml" {
|
||||
docFile = f
|
||||
break
|
||||
}
|
||||
}
|
||||
if docFile == nil {
|
||||
return "", fmt.Errorf("invalid docx file: word/document.xml not found")
|
||||
}
|
||||
|
||||
rc, err := docFile.Open()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer rc.Close()
|
||||
|
||||
data, err := io.ReadAll(rc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var doc wDocument
|
||||
if err := xml.Unmarshal(data, &doc); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var md string
|
||||
for _, paragraph := range doc.Body.P {
|
||||
var pText string
|
||||
for _, run := range paragraph.R {
|
||||
for _, text := range run.T {
|
||||
pText += text
|
||||
}
|
||||
}
|
||||
if pText != "" {
|
||||
md += pText + "\n\n"
|
||||
}
|
||||
}
|
||||
|
||||
return md, nil
|
||||
}
|
||||
Reference in New Issue
Block a user