init: initial commit
This commit is contained in:
@@ -0,0 +1,33 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CADParser is a stub parser for CAD/DWG files.
//
// It never opens the file it is given: ParseToMarkdown returns a fixed,
// simulated report. A real implementation requires CGO/GDAL and is reserved
// for future development.
type CADParser struct{}

// SupportedExtensions returns the file extensions (lower case, with the
// leading dot) that CADParser handles.
func (p *CADParser) SupportedExtensions() []string {
	return []string{".dwg", ".dxf"}
}

// ParseToMarkdown returns a simulated Markdown report for the drawing at path.
//
// path is only echoed into the report; the file is not read, so the layer
// table and annotation values are hard-coded sample data and the returned
// error is always nil. The report ends with a warning line stating that the
// result is simulated.
func (p *CADParser) ParseToMarkdown(path string) (string, error) {
	// Stub: simulate extracting layer names and annotations.
	var sb strings.Builder
	sb.WriteString("## CAD 图纸解析结果 (模拟)\n\n")
	// Fprintf writes straight into the builder instead of allocating an
	// intermediate string with Sprintf.
	fmt.Fprintf(&sb, "**文件**: %s\n\n", path)
	sb.WriteString("### 图层列表\n\n")
	sb.WriteString("| 图层名 | 类型 | 元素数 |\n")
	sb.WriteString("| --- | --- | --- |\n")
	sb.WriteString("| STR_MAIN | 结构主体 | 142 |\n")
	sb.WriteString("| DIM_TEXT | 标注文字 | 87 |\n")
	sb.WriteString("| SEAL_V3 | 密封层 | 23 |\n\n")
	sb.WriteString("### 标注摘要\n\n")
	sb.WriteString("- 预留缝宽度: 2.5mm\n")
	sb.WriteString("- 坐标基点: X=1240, Y=442\n")
	sb.WriteString("\n> ⚠️ 本解析为模拟结果,完整 CAD 解析需集成 GDAL/LibreDWG。\n")

	return sb.String(), nil
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/xuri/excelize/v2"
|
||||
)
|
||||
|
||||
// ExcelParser converts Excel workbooks to Markdown, rendering each worksheet
// as its own table.
type ExcelParser struct{}

// SupportedExtensions reports the file extensions (with leading dot) that are
// routed to ExcelParser.
//
// NOTE(review): ".xls" is advertised, but excelize documents support for the
// OOXML formats; legacy binary .xls workbooks presumably fail in
// excelize.OpenFile — confirm before relying on it.
func (p *ExcelParser) SupportedExtensions() []string {
	extensions := make([]string, 0, 2)
	extensions = append(extensions, ".xlsx", ".xls")
	return extensions
}
|
||||
|
||||
func (p *ExcelParser) ParseToMarkdown(path string) (string, error) {
|
||||
f, err := excelize.OpenFile(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open excel: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var sb strings.Builder
|
||||
for _, sheet := range f.GetSheetList() {
|
||||
rows, err := f.GetRows(sheet)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
sb.WriteString(fmt.Sprintf("## Sheet: %s\n\n", sheet))
|
||||
|
||||
// Header row
|
||||
sb.WriteString("| ")
|
||||
for _, cell := range rows[0] {
|
||||
sb.WriteString(cell + " | ")
|
||||
}
|
||||
sb.WriteString("\n|")
|
||||
for range rows[0] {
|
||||
sb.WriteString(" --- |")
|
||||
}
|
||||
sb.WriteString("\n")
|
||||
|
||||
// Data rows
|
||||
for _, row := range rows[1:] {
|
||||
sb.WriteString("| ")
|
||||
for i := 0; i < len(rows[0]); i++ {
|
||||
if i < len(row) {
|
||||
sb.WriteString(row[i])
|
||||
}
|
||||
sb.WriteString(" | ")
|
||||
}
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Parser interface for all document parsers.
|
||||
type Parser interface {
|
||||
SupportedExtensions() []string
|
||||
ParseToMarkdown(path string) (string, error)
|
||||
}
|
||||
|
||||
// Registry holds registered parsers keyed by extension.
|
||||
type Registry struct {
|
||||
parsers map[string]Parser
|
||||
}
|
||||
|
||||
// NewRegistry creates a parser registry with all built-in parsers.
|
||||
func NewRegistry() *Registry {
|
||||
r := &Registry{parsers: make(map[string]Parser)}
|
||||
r.Register(&ExcelParser{})
|
||||
r.Register(&PDFParser{})
|
||||
r.Register(&CADParser{})
|
||||
r.Register(&WordParser{})
|
||||
return r
|
||||
}
|
||||
|
||||
// Register adds a parser for its supported extensions.
|
||||
func (r *Registry) Register(p Parser) {
|
||||
for _, ext := range p.SupportedExtensions() {
|
||||
r.parsers[strings.ToLower(ext)] = p
|
||||
}
|
||||
}
|
||||
|
||||
// Parse dispatches to the appropriate parser based on file extension.
|
||||
func (r *Registry) Parse(path string) (string, error) {
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
p, ok := r.parsers[ext]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("unsupported file type: %s", ext)
|
||||
}
|
||||
return p.ParseToMarkdown(path)
|
||||
}
|
||||
@@ -0,0 +1,76 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
// PDFParser pulls the plain text out of PDF documents with the open-source
// ledongthuc/pdf library.
type PDFParser struct{}

// SupportedExtensions reports the single extension handled by PDFParser.
func (p *PDFParser) SupportedExtensions() []string {
	extensions := make([]string, 0, 1)
	extensions = append(extensions, ".pdf")
	return extensions
}
|
||||
|
||||
func (p *PDFParser) ParseToMarkdown(path string) (string, error) {
|
||||
f, r, err := pdf.Open(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open pdf: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var sb strings.Builder
|
||||
totalPages := r.NumPage()
|
||||
|
||||
for i := 1; i <= totalPages; i++ {
|
||||
page := r.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
content := strings.TrimSpace(text)
|
||||
if content != "" {
|
||||
sb.WriteString(fmt.Sprintf("## Page %d\n\n%s\n\n", i, content))
|
||||
}
|
||||
}
|
||||
|
||||
if sb.Len() == 0 {
|
||||
// Fallback: try reading entire content at once
|
||||
content, err := readPDFPlainText(path)
|
||||
if err == nil && content != "" {
|
||||
return content, nil
|
||||
}
|
||||
return "", fmt.Errorf("no text content extracted from PDF")
|
||||
}
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
|
||||
func readPDFPlainText(path string) (string, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
stat, _ := f.Stat()
|
||||
reader, err := pdf.NewReader(f, stat.Size())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var sb strings.Builder
|
||||
for i := 1; i <= reader.NumPage(); i++ {
|
||||
page := reader.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, _ := page.GetPlainText(nil)
|
||||
sb.WriteString(text)
|
||||
}
|
||||
return sb.String(), nil
|
||||
}
|
||||
@@ -0,0 +1,134 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
|
||||
"github.com/wailsapp/wails/v3/pkg/application"
|
||||
)
|
||||
|
||||
type (
	// Job describes one file to be parsed.
	Job struct {
		FilePath  string // path handed to the parser registry
		FileID    string // caller-supplied file identifier; not interpreted here
		ProjectID string // caller-supplied project identifier; not interpreted here
	}

	// Result carries the outcome of parsing a Job.
	Result struct {
		Job     Job    // the job this result belongs to
		Content string // parser output for the job
		Err     error  // parse error, or nil on success
	}
)
|
||||
|
||||
// ProcessingQueue manages concurrent file parsing with a worker pool.
|
||||
type ProcessingQueue struct {
|
||||
registry *Registry
|
||||
jobs chan Job
|
||||
results chan Result
|
||||
concurrency int
|
||||
wg sync.WaitGroup
|
||||
onComplete func(Result)
|
||||
}
|
||||
|
||||
// NewProcessingQueue creates a queue with the given concurrency limit.
|
||||
func NewProcessingQueue(registry *Registry, concurrency int, onComplete func(Result)) *ProcessingQueue {
|
||||
q := &ProcessingQueue{
|
||||
registry: registry,
|
||||
jobs: make(chan Job, 100),
|
||||
results: make(chan Result, 100),
|
||||
concurrency: concurrency,
|
||||
onComplete: onComplete,
|
||||
}
|
||||
q.start()
|
||||
return q
|
||||
}
|
||||
|
||||
func (q *ProcessingQueue) start() {
|
||||
// Start workers
|
||||
for i := 0; i < q.concurrency; i++ {
|
||||
go q.worker(i)
|
||||
}
|
||||
// Start result collector
|
||||
go func() {
|
||||
for result := range q.results {
|
||||
if q.onComplete != nil {
|
||||
q.onComplete(result)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (q *ProcessingQueue) worker(id int) {
|
||||
for job := range q.jobs {
|
||||
slog.Info("parser worker processing", "worker", id, "file", job.FilePath)
|
||||
content, err := q.registry.Parse(job.FilePath)
|
||||
if err != nil {
|
||||
slog.Error("parse failed", "file", job.FilePath, "err", err)
|
||||
}
|
||||
q.results <- Result{Job: job, Content: content, Err: err}
|
||||
q.wg.Done()
|
||||
}
|
||||
}
|
||||
|
||||
// Submit adds a job to the processing queue.
|
||||
func (q *ProcessingQueue) Submit(job Job) {
|
||||
q.wg.Add(1)
|
||||
q.jobs <- job
|
||||
}
|
||||
|
||||
// Wait blocks until all queued jobs are complete.
|
||||
func (q *ProcessingQueue) Wait() {
|
||||
q.wg.Wait()
|
||||
}
|
||||
|
||||
// Close shuts down the queue.
|
||||
func (q *ProcessingQueue) Close() {
|
||||
close(q.jobs)
|
||||
q.wg.Wait()
|
||||
close(q.results)
|
||||
}
|
||||
|
||||
// ParseService wraps the parsing pipeline for Wails binding.
|
||||
type ParseService struct {
|
||||
registry *Registry
|
||||
queue *ProcessingQueue
|
||||
}
|
||||
|
||||
// NewParseService creates a new parse service.
|
||||
func NewParseService() *ParseService {
|
||||
return &ParseService{
|
||||
registry: NewRegistry(),
|
||||
}
|
||||
}
|
||||
|
||||
// ParseFile synchronously parses a single file. For Wails binding.
|
||||
func (s *ParseService) ParseFile(path string) (string, error) {
|
||||
content, err := s.registry.Parse(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse %s: %w", path, err)
|
||||
}
|
||||
return content, nil
|
||||
}
|
||||
|
||||
// GetSupportedTypes returns supported file extensions.
|
||||
func (s *ParseService) GetSupportedTypes() []string {
|
||||
return []string{".xlsx", ".xls", ".pdf", ".dwg", ".dxf", ".docx"}
|
||||
}
|
||||
|
||||
// ParseDeliveryStandard opens a file dialog to select a document, parses it, and returns the markdown.
|
||||
func (s *ParseService) ParseDeliveryStandard() (string, error) {
|
||||
dialog := application.Get().Dialog.OpenFile()
|
||||
dialog.SetTitle("选择交付标准文件 (Delivery Standard)")
|
||||
dialog.AddFilter("Documents", "*.pdf;*.xlsx;*.xls;*.docx")
|
||||
|
||||
path, err := dialog.PromptForSingleSelection()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open file dialog: %w", err)
|
||||
}
|
||||
if path == "" {
|
||||
return "", nil // user cancelled
|
||||
}
|
||||
return s.ParseFile(path)
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
package parser
|
||||
|
||||
import (
	"archive/zip"
	"encoding/xml"
	"fmt"
	"io"
	"strings"
)
|
||||
|
||||
// WordParser extracts text from .docx files.
type WordParser struct{}

// SupportedExtensions returns the extensions this parser handles.
func (p *WordParser) SupportedExtensions() []string {
	return []string{".docx"}
}

// wDocument mirrors the root element of word/document.xml; only the body is
// decoded.
type wDocument struct {
	Body wBody `xml:"body"`
}

// wBody holds the paragraphs that are direct children of the document body.
type wBody struct {
	P []wP `xml:"p"`
}

// wP is one paragraph: the runs that are its direct children.
type wP struct {
	R []wR `xml:"r"`
}

// wR is one run: the text elements that are its direct children.
type wR struct {
	T []string `xml:"t"`
}

// ParseToMarkdown extracts the text of the Word document at path and returns
// it as Markdown-like text: the text of every non-empty paragraph followed by
// a blank line.
//
// Only paragraphs directly under the body, and runs directly under those
// paragraphs, are read (see the w* structs); no formatting is carried over.
// An error is returned if the file is not a zip archive, has no
// word/document.xml entry, or that entry cannot be read or decoded.
func (p *WordParser) ParseToMarkdown(path string) (string, error) {
	r, err := zip.OpenReader(path)
	if err != nil {
		return "", fmt.Errorf("failed to open docx as zip: %w", err)
	}
	defer r.Close()

	var docFile *zip.File
	for _, f := range r.File {
		if f.Name == "word/document.xml" {
			docFile = f
			break
		}
	}
	if docFile == nil {
		return "", fmt.Errorf("invalid docx file: word/document.xml not found")
	}

	rc, err := docFile.Open()
	if err != nil {
		return "", fmt.Errorf("open word/document.xml: %w", err)
	}
	defer rc.Close()

	data, err := io.ReadAll(rc)
	if err != nil {
		return "", fmt.Errorf("read word/document.xml: %w", err)
	}

	var doc wDocument
	if err := xml.Unmarshal(data, &doc); err != nil {
		return "", fmt.Errorf("decode word/document.xml: %w", err)
	}

	// A builder keeps assembly linear; repeated string += re-copies the
	// whole document for every run.
	var md strings.Builder
	for _, paragraph := range doc.Body.P {
		start := md.Len()
		for _, run := range paragraph.R {
			for _, text := range run.T {
				md.WriteString(text)
			}
		}
		// Separate only paragraphs that actually produced text.
		if md.Len() > start {
			md.WriteString("\n\n")
		}
	}

	return md.String(), nil
}
|
||||
Reference in New Issue
Block a user