Files
AI-Writie-Assistant/internal/parser/word_parser.go
T
2026-04-07 17:35:09 +08:00

84 lines
1.4 KiB
Go

package parser
import (
"archive/zip"
"encoding/xml"
"fmt"
"io"
)
// WordParser extracts text from Microsoft Word .docx files.
// It has no fields, so the zero value is ready to use.
type WordParser struct{}

// SupportedExtensions returns the file extensions, including the leading dot,
// that this parser handles. A new slice is returned on every call.
func (p *WordParser) SupportedExtensions() []string {
	const docxExt = ".docx"
	return []string{docxExt}
}
// The types below describe just enough of a word/document.xml tree for
// encoding/xml to reach run text along the path document > body > p > r > t.
// The struct tags name no namespace, so elements are matched by local name.
type (
	// wDocument is the decoding target for the document's root element.
	wDocument struct {
		Body wBody `xml:"body"`
	}

	// wBody collects the paragraphs that are direct children of the body.
	wBody struct {
		P []wP `xml:"p"`
	}

	// wP is one paragraph: the runs that are its direct children.
	wP struct {
		R []wR `xml:"r"`
	}

	// wR is one run: the character data of each of its direct t children.
	wR struct {
		T []string `xml:"t"`
	}
)
// ParseToMarkdown extracts the text of the .docx file at path and returns it
// as Markdown-like text: the text of every non-empty paragraph, each followed
// by a blank line.
//
// The main document part (word/document.xml) is read as a token stream, so
// text nested below a paragraph's direct runs (hyperlinks, table cells,
// content controls, text boxes) is kept instead of being silently dropped.
//
// It returns an error if path cannot be opened as a zip archive, if the
// archive has no word/document.xml entry, or if that entry cannot be read or
// is not well-formed XML.
func (p *WordParser) ParseToMarkdown(path string) (string, error) {
	archive, err := zip.OpenReader(path)
	if err != nil {
		return "", fmt.Errorf("failed to open docx as zip: %w", err)
	}
	defer archive.Close()

	var docFile *zip.File
	for _, f := range archive.File {
		if f.Name == "word/document.xml" {
			docFile = f
			break
		}
	}
	if docFile == nil {
		return "", fmt.Errorf("invalid docx file: word/document.xml not found")
	}

	rc, err := docFile.Open()
	if err != nil {
		return "", fmt.Errorf("opening word/document.xml: %w", err)
	}
	defer rc.Close()

	md, err := extractDocxText(rc)
	if err != nil {
		return "", fmt.Errorf("parsing word/document.xml: %w", err)
	}
	return md, nil
}

// extractDocxText streams the XML in r and returns the text of every
// non-empty p element found anywhere inside the body element, each followed
// by "\n\n". Elements are matched by local name only; the namespace is
// ignored. A paragraph nested inside another one (as in a text box) is
// emitted on its own, before the paragraph that contains it.
func extractDocxText(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)

	var (
		out     []byte   // finished paragraphs, already separated
		open    [][]byte // text of the paragraphs currently open, innermost last
		inBody  bool
		inText  bool
		sawRoot bool
	)
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", err
		}

		switch el := tok.(type) {
		case xml.StartElement:
			sawRoot = true
			switch el.Name.Local {
			case "body":
				inBody = true
			case "p":
				if inBody {
					open = append(open, nil)
				}
			case "t":
				inText = len(open) > 0
			case "Fallback":
				// mc:Fallback repeats the content of its sibling mc:Choice
				// in a legacy form; reading both would duplicate the text.
				if err := dec.Skip(); err != nil {
					return "", err
				}
			}

		case xml.EndElement:
			switch el.Name.Local {
			case "body":
				inBody = false
			case "p":
				if n := len(open); n > 0 {
					if text := open[n-1]; len(text) > 0 {
						out = append(out, text...)
						out = append(out, "\n\n"...)
					}
					open = open[:n-1]
				}
			case "t":
				inText = false
			}

		case xml.CharData:
			// The token's bytes are only valid until the next Token call,
			// so they are copied into the paragraph buffer right away.
			if inText {
				top := len(open) - 1
				open[top] = append(open[top], el...)
			}
		}
	}

	// An entry with no element at all is not a document; report it instead
	// of returning an empty result as if parsing had succeeded.
	if !sawRoot {
		return "", io.ErrUnexpectedEOF
	}
	return string(out), nil
}