package parser

import (
	"archive/zip"
	"encoding/xml"
	"fmt"
	"io"
	"strings"
)

// WordParser extracts text from .docx files.
type WordParser struct{}

// SupportedExtensions returns the file extensions this parser handles.
func (p *WordParser) SupportedExtensions() []string {
	return []string{".docx"}
}

// The types below mirror the only part of word/document.xml that is read:
// body > p > r > t. encoding/xml matches these untagged-namespace names by
// local element name, and each level only matches direct children of the
// level above it.

// wDocument is the root element of word/document.xml.
type wDocument struct {
	Body wBody `xml:"body"`
}

// wBody holds the paragraphs that are direct children of the body element.
type wBody struct {
	P []wP `xml:"p"`
}

// wP is a paragraph: the runs that are direct children of a "p" element.
type wP struct {
	R []wR `xml:"r"`
}

// wR is a run: the character data of every "t" element directly inside an
// "r" element.
type wR struct {
	T []string `xml:"t"`
}

// ParseToMarkdown extracts the text of the Word document at path and returns
// it as Markdown-like text: the text of every non-empty paragraph followed by
// a blank line ("\n\n"). The text itself is copied verbatim; no Markdown
// escaping or formatting is applied.
//
// Only runs that sit directly inside paragraphs that sit directly inside the
// document body are read. Text nested deeper in the XML tree is not
// collected.
// NOTE(review): that presumably excludes content such as table cells and
// hyperlinks - confirm whether this is intended.
//
// An error is returned when path cannot be opened as a zip archive, when the
// archive has no word/document.xml entry, or when that entry cannot be read
// or parsed as XML. Underlying errors are wrapped with %w, so errors.Is and
// errors.As keep working on the result.
func (p *WordParser) ParseToMarkdown(path string) (string, error) {
	r, err := zip.OpenReader(path)
	if err != nil {
		return "", fmt.Errorf("failed to open docx as zip: %w", err)
	}
	defer r.Close()

	var docFile *zip.File
	for _, f := range r.File {
		if f.Name == "word/document.xml" {
			docFile = f
			break
		}
	}
	if docFile == nil {
		return "", fmt.Errorf("invalid docx file: word/document.xml not found")
	}

	rc, err := docFile.Open()
	if err != nil {
		return "", fmt.Errorf("opening word/document.xml: %w", err)
	}
	defer rc.Close()

	// Read the entry to EOF rather than streaming it into the decoder: the
	// zip reader only reports a checksum mismatch once the end of the entry
	// is reached.
	data, err := io.ReadAll(rc)
	if err != nil {
		return "", fmt.Errorf("reading word/document.xml: %w", err)
	}

	var doc wDocument
	if err := xml.Unmarshal(data, &doc); err != nil {
		return "", fmt.Errorf("parsing word/document.xml: %w", err)
	}

	// A single builder avoids the quadratic cost of repeated string
	// concatenation on large documents.
	var md strings.Builder
	for _, paragraph := range doc.Body.P {
		start := md.Len()
		for _, run := range paragraph.R {
			for _, text := range run.T {
				md.WriteString(text)
			}
		}
		// Terminate the paragraph only if it contributed any text, so empty
		// paragraphs leave no trace in the output.
		if md.Len() > start {
			md.WriteString("\n\n")
		}
	}
	return md.String(), nil
}