84 lines
1.4 KiB
Go
84 lines
1.4 KiB
Go
package parser
|
|
|
|
import (
|
|
"archive/zip"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
)
|
|
|
|
// WordParser extracts text from .docx files.
//
// The type has no fields, so its zero value carries no state.
type WordParser struct{}
|
|
|
|
// SupportedExtensions returns extensions this parser handles
|
|
func (p *WordParser) SupportedExtensions() []string {
|
|
return []string{".docx"}
|
|
}
|
|
|
|
// The types below mirror the subset of the document XML that the parser
// reads. Each level only captures direct child elements with the tagged
// name; every other element is ignored by encoding/xml.
type (
	// wDocument is the root element; only its body child is decoded.
	wDocument struct {
		Body wBody `xml:"body"`
	}

	// wBody collects the p (paragraph) elements directly under the body,
	// in document order.
	wBody struct {
		P []wP `xml:"p"`
	}

	// wP is a single paragraph: the r (run) elements directly under it.
	wP struct {
		R []wR `xml:"r"`
	}

	// wR is a single run: the character data of each of its t (text)
	// child elements.
	wR struct {
		T []string `xml:"t"`
	}
)
|
|
|
|
// ParseToMarkdown extracts text from the word document and returns it as Markdown-like text
|
|
func (p *WordParser) ParseToMarkdown(path string) (string, error) {
|
|
r, err := zip.OpenReader(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to open docx as zip: %w", err)
|
|
}
|
|
defer r.Close()
|
|
|
|
var docFile *zip.File
|
|
for _, f := range r.File {
|
|
if f.Name == "word/document.xml" {
|
|
docFile = f
|
|
break
|
|
}
|
|
}
|
|
if docFile == nil {
|
|
return "", fmt.Errorf("invalid docx file: word/document.xml not found")
|
|
}
|
|
|
|
rc, err := docFile.Open()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer rc.Close()
|
|
|
|
data, err := io.ReadAll(rc)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var doc wDocument
|
|
if err := xml.Unmarshal(data, &doc); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var md string
|
|
for _, paragraph := range doc.Body.P {
|
|
var pText string
|
|
for _, run := range paragraph.R {
|
|
for _, text := range run.T {
|
|
pText += text
|
|
}
|
|
}
|
|
if pText != "" {
|
|
md += pText + "\n\n"
|
|
}
|
|
}
|
|
|
|
return md, nil
|
|
}
|