refactor: excel parse
This commit is contained in:
@@ -0,0 +1,868 @@
|
||||
# Copyright (c) 2020-2023, Manfred Moitzi
|
||||
# License: MIT License
|
||||
from __future__ import annotations
|
||||
|
||||
import string
|
||||
import typing
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
BinaryIO,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Callable,
|
||||
Union,
|
||||
Optional,
|
||||
)
|
||||
import itertools
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
from ezdxf.lldxf import const
|
||||
from ezdxf.lldxf import repair
|
||||
from ezdxf.lldxf.encoding import (
|
||||
has_dxf_unicode,
|
||||
decode_dxf_unicode,
|
||||
has_mif_encoding,
|
||||
decode_mif_to_unicode,
|
||||
)
|
||||
from ezdxf.lldxf.types import (
|
||||
DXFTag,
|
||||
DXFVertex,
|
||||
DXFBinaryTag,
|
||||
POINT_CODES,
|
||||
BINARY_DATA,
|
||||
TYPE_TABLE,
|
||||
MAX_GROUP_CODE,
|
||||
)
|
||||
from ezdxf.lldxf.tags import group_tags, Tags
|
||||
from ezdxf.lldxf.validator import entity_structure_validator
|
||||
from ezdxf.tools.codepage import toencoding
|
||||
from ezdxf.audit import Auditor, AuditError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ezdxf.document import Drawing
|
||||
from ezdxf.eztypes import SectionDict
|
||||
|
||||
# Names exported by a star-import of this module.
# NOTE(review): explore() is defined in this module but not listed here - confirm
# whether that omission is intentional.
__all__ = ["read", "readfile"]
|
||||
|
||||
# DXF types which Recover.check_entities() passes through without running
# the entity structure validator.
EXCLUDE_STRUCTURE_CHECK = {
    "SECTION", "ENDSEC", "EOF",
    "TABLE", "ENDTAB",
    "ENDBLK", "SEQEND",
}  # fmt: skip

# Shared package logger.
logger = logging.getLogger("ezdxf")
|
||||
|
||||
|
||||
def readfile(
    filename: Union[str, Path], errors: str = "surrogateescape"
) -> tuple[Drawing, Auditor]:
    """Read a DXF document from the file system similar to
    :func:`ezdxf.readfile`, but repair as many flaws as possible, run the
    required audit process automatically and return the DXF document and the
    :class:`Auditor`.

    Args:
        filename: file-system name of the DXF document to load
        errors: specify decoding error handler

            - "surrogateescape" to preserve possible binary data (default)
            - "ignore" to silently drop data which can not be decoded
            - "strict" to raise an :class:`UnicodeDecodeError` exception for
              invalid data

    Raises:
        DXFStructureError: for invalid or corrupted DXF structures
        UnicodeDecodeError: if `errors` is "strict" and a decoding error occurs

    """
    path = str(filename)
    # Binary mode: the text encoding is detected from the file content.
    with open(path, mode="rb") as stream:
        document, auditor = read(stream, errors=errors)
    document.filename = path
    return document, auditor
|
||||
|
||||
|
||||
def read(stream: BinaryIO, errors: str = "surrogateescape") -> tuple[Drawing, Auditor]:
    """Read a DXF document from a binary stream similar to :func:`ezdxf.read`,
    but detect the text encoding automatically, repair as many flaws as
    possible, run the required audit process afterwards and return the DXF
    document and the :class:`Auditor`.

    Args:
        stream: data stream to load in binary read mode
        errors: specify decoding error handler

            - "surrogateescape" to preserve possible binary data (default)
            - "ignore" to silently drop data which can not be decoded
            - "strict" to raise an :class:`UnicodeDecodeError` exception for
              invalid data

    Raises:
        DXFStructureError: for invalid or corrupted DXF structures
        UnicodeDecodeError: if `errors` is "strict" and a decoding error occurs

    """
    # Recover.run() uses the default tag loader when none is passed.
    return _load_and_audit_document(Recover.run(stream, errors=errors))
|
||||
|
||||
|
||||
def explore(
    filename: Union[str, Path], errors: str = "ignore"
) -> tuple[Drawing, Auditor]:
    """Read a DXF document from the file system similar to :func:`readfile`,
    but use a special tag loader, which tries to recover the tag stream if
    invalid tags occur. This function is intended to load corrupted DXF files
    and should only be used to explore such files, data loss is very likely.

    Args:
        filename: file-system name of the DXF document to load
        errors: specify decoding error handler

            - "surrogateescape" to preserve possible binary data
            - "ignore" to silently drop data which can not be decoded (default)
            - "strict" to raise an :class:`UnicodeDecodeError` exception for
              invalid data

    Raises:
        DXFStructureError: for invalid or corrupted DXF structures
        UnicodeDecodeError: if `errors` is "strict" and a decoding error occurs

    """
    path = str(filename)
    with open(path, mode="rb") as stream:
        # The synced loader skips lines which do not look like a group code.
        tool = Recover.run(stream, errors=errors, loader=synced_bytes_loader)
        document, auditor = _load_and_audit_document(tool)
    document.filename = path
    return document, auditor
|
||||
|
||||
|
||||
def _load_and_audit_document(recover_tool) -> tuple[Drawing, Auditor]:
    """Build a :class:`Drawing` from the section dict of `recover_tool`,
    forward the collected errors and fixes to a new :class:`Auditor`, run the
    audit and return the document and the auditor.
    """
    # Imported locally, the module level import is for type checking only.
    from ezdxf.document import Drawing

    document = Drawing()
    document._load_section_dict(recover_tool.section_dict)

    auditor = Auditor(document)
    for error_code, message in recover_tool.errors:
        auditor.add_error(error_code, message)
    for error_code, message in recover_tool.fixes:
        auditor.fixed_error(error_code, message)
    auditor.run()
    return document, auditor
|
||||
|
||||
|
||||
# noinspection PyMethodMayBeStatic
class Recover:
    """Loose coupled recovering tools.

    The classmethod :meth:`run` executes the whole recover process and returns
    the tool instance, which provides the result as attributes:

    - section_dict: a Drawing compatible dict of sections
    - errors: (error code, message) tuples collected by the tag loader
    - fixes: (error code, message) tuples of applied structure fixes
    - dxfversion: detected DXF version, DXF R12 if not detectable

    """

    def __init__(self, loader: Optional[Callable] = None):
        # different tag loading strategies can be used:
        # - bytes_loader(): expects a valid low level structure
        # - synced_bytes_loader(): loads everything which looks like a tag
        #   and skip other content (dangerous!)
        self.tag_loader = loader or bytes_loader

        # The main goal of all efforts, a Drawing compatible dict of sections:
        self.section_dict: "SectionDict" = dict()

        # Store error messages from low level processes
        self.errors: list[tuple[int, str]] = []
        self.fixes: list[tuple[int, str]] = []

        # Detected DXF version
        self.dxfversion = const.DXF12

    @classmethod
    def run(
        cls,
        stream: BinaryIO,
        loader: Optional[Callable] = None,
        errors: str = "surrogateescape",
    ) -> Recover:
        """Execute the recover process.

        Args:
            stream: data stream to load in binary read mode
            loader: low level tag loader, see :meth:`__init__`
            errors: decoding error handler, passed to the tag loader

        Returns:
            the recover tool instance which holds the recovered sections,
            the collected errors and the applied fixes

        """
        # cls() instead of Recover() keeps subclasses working
        recover_tool = cls(loader)
        tags = recover_tool.load_tags(stream, errors)
        sections = recover_tool.rebuild_sections(tags)
        recover_tool.load_section_dict(sections)
        tables = recover_tool.section_dict.get("TABLES")
        if tables:
            tables = recover_tool.rebuild_tables(tables)  # type: ignore
            recover_tool.section_dict["TABLES"] = tables

        is_r12 = recover_tool.dxfversion <= const.DXF12
        if not is_r12:
            recover_tool.recover_rootdict()
            recover_tool.fix_broken_layout_links()

        section_dict = recover_tool.section_dict
        # Only existing keys are replaced, the dict size does not change
        # while iterating.
        for name, entities in section_dict.items():
            if name in {"TABLES", "BLOCKS", "OBJECTS", "ENTITIES"}:
                section_dict[name] = list(
                    recover_tool.check_entities(entities, is_r12)  # type: ignore
                )

        return recover_tool

    def load_tags(self, stream: BinaryIO, errors: str) -> Iterator[DXFTag]:
        """Returns the repaired and compiled tag stream of `stream`, low
        level loading errors are stored in :attr:`errors`.
        """
        return safe_tag_loader(
            stream, self.tag_loader, messages=self.errors, errors=errors
        )

    def rebuild_sections(self, tags: Iterable[DXFTag]) -> list[list[DXFTag]]:
        """Collect tags between SECTION and ENDSEC or next SECTION tag
        as list of DXFTag objects, collects tags outside of sections
        as an extra section.

        A section which is still open at the end of the tag stream (truncated
        file without ENDSEC and EOF tag) is preserved and reported as
        missing ENDSEC tag.

        Returns:
            List of sections as list of DXFTag() objects, the last section
            contains orphaned tags found outside of sections

        """

        # Invalid placed DXF entities are removed in the audit process!
        def missing_endsec() -> None:
            self.fixes.append(
                (
                    AuditError.MISSING_ENDSEC_TAG,
                    "DXF structure error: missing ENDSEC tag.",
                )
            )

        orphans: list[DXFTag] = []
        sections: list[list[DXFTag]] = []
        collector: list[DXFTag] = []
        inside_section = False

        for tag in tags:
            code, value = tag
            if code == 0 and value == "SECTION":
                if inside_section:  # missing ENDSEC
                    missing_endsec()
                    sections.append(collector)
                    collector = []
                collector.append(tag)
                inside_section = True
            elif code == 0 and value == "ENDSEC":
                # ENDSEC tag is not collected
                if inside_section:
                    sections.append(collector)
                else:  # missing SECTION
                    # ignore this tag, it is even not an orphan
                    self.fixes.append(
                        (
                            AuditError.MISSING_SECTION_TAG,
                            "DXF structure error: missing SECTION tag.",
                        )
                    )
                collector = []
                inside_section = False
            elif code == 0 and value == "EOF":
                # EOF tag is not collected and is not an orphan
                if inside_section:
                    missing_endsec()
                    sections.append(collector)
                    collector = []
                    inside_section = False
            elif inside_section:
                collector.append(tag)
            else:
                self.fixes.append(
                    (
                        AuditError.FOUND_TAG_OUTSIDE_SECTION,
                        f"DXF structure error: found tag outside section: "
                        f"({code}, {value})",
                    )
                )
                orphans.append(tag)

        if inside_section:
            # Truncated tag stream: neither ENDSEC nor EOF closed the last
            # section, keep the collected tags instead of dropping them.
            missing_endsec()
            sections.append(collector)

        sections.append(orphans)
        return sections

    def load_section_dict(self, sections: list[list[DXFTag]]) -> None:
        """Merge sections of same type.

        Stores the managed sections as grouped entities in
        :attr:`section_dict` and sets :attr:`dxfversion`. The last item of
        `sections` is expected to be the list of orphaned tags and is removed
        from the given list.

        """

        def add_section(name: str, tags) -> None:
            if name in section_dict:
                # skip the leading (0, SECTION) and (2, <name>) tags
                section_dict[name].extend(tags[2:])
            else:
                section_dict[name] = tags

        def _build_section_dict(d: dict) -> None:
            for name, section in d.items():
                if name in const.MANAGED_SECTIONS:
                    self.section_dict[name] = list(group_tags(section, 0))

        def _remove_unsupported_sections(d: dict):
            for name in ("CLASSES", "OBJECTS", "ACDSDATA"):
                if name in d:
                    del d[name]
                    self.fixes.append(
                        (
                            AuditError.REMOVED_UNSUPPORTED_SECTION,
                            f"Removed unsupported {name} section for DXF R12.",
                        )
                    )

        # Last section could be orphaned tags:
        orphans = sections.pop() if sections else []
        if orphans and orphans[0] == (0, "SECTION"):
            # The last section contains not the orphaned tags:
            sections.append(orphans)
            orphans = []

        section_dict: "SectionDict" = dict()
        for section in sections:
            # A section may consist of the (0, SECTION) tag only, accessing
            # section[1] unconditionally would raise an IndexError.
            code, name = section[1] if len(section) > 1 else (None, None)
            if code == 2:
                add_section(name, section)
            else:  # the tag following (0, SECTION) is not a name tag
                self.fixes.append(
                    (
                        AuditError.MISSING_SECTION_NAME_TAG,
                        "DXF structure error: missing section name tag, ignore section.",
                    )
                )

        header = section_dict.setdefault(
            "HEADER",
            [
                DXFTag(0, "SECTION"),  # type: ignore
                DXFTag(2, "HEADER"),  # type: ignore
            ],
        )
        self.rescue_orphaned_header_vars(header, orphans)  # type: ignore
        self.dxfversion = _detect_dxf_version(header)
        if self.dxfversion <= const.DXF12:
            _remove_unsupported_sections(section_dict)
        _build_section_dict(section_dict)

    def rebuild_tables(self, tables: list[Tags]) -> list[Tags]:
        """Rebuild TABLES section.

        Collects the table entries by their DXF type, regardless of their
        location in the section, and returns a new list of entities in the
        table order of ``const.TABLE_NAMES_ACAD_ORDER``. Tables without any
        entries are not included in the result.

        """

        # Note: the recover module does not report invalid placed table entries,
        # it just recovers them. The "normal" loading process ignore these
        # misplaced table entries and logs a warning.

        def append_table(name: str):
            if name not in content:
                return

            head = heads.get(name)
            if head:
                rebuilt.append(head)
            else:
                # The new table head gets a valid handle from Auditor.
                rebuilt.append(Tags([DXFTag(0, "TABLE"), DXFTag(2, name)]))
            rebuilt.extend(content[name])
            rebuilt.append(Tags([DXFTag(0, "ENDTAB")]))

        heads = dict()
        content = defaultdict(list)
        valid_tables = set(const.TABLE_NAMES_ACAD_ORDER)

        for entry in tables:
            name = entry[0].value.upper()
            if name == "TABLE":
                try:
                    table_name = entry[1].value.upper()
                except (IndexError, AttributeError):
                    # table head without usable name tag, a new head will be
                    # created if entries for this table exist
                    pass
                else:
                    heads[table_name] = entry
            elif name in valid_tables:
                content[name].append(entry)
        rebuilt = [Tags([DXFTag(0, "SECTION"), DXFTag(2, "TABLES")])]

        names = list(const.TABLE_NAMES_ACAD_ORDER)
        if self.dxfversion <= const.DXF12:
            # Ignore BLOCK_RECORD table
            names.remove("BLOCK_RECORD")
            if "BLOCK_RECORD" in content:
                self.fixes.append(
                    (
                        AuditError.REMOVED_UNSUPPORTED_TABLE,
                        "Removed unsupported BLOCK_RECORD table for DXF R12.",
                    )
                )

        for name in names:
            append_table(name)
        return rebuilt

    def rescue_orphaned_header_vars(
        self, header: list[DXFTag], orphans: Iterable[DXFTag]
    ) -> None:
        """Append header variables found in `orphans` to `header`.

        A header variable is a (9, <name>) tag followed by a value tag, only
        the first tag after the name tag is used as value. All other orphaned
        tags are ignored.

        """
        var_name = None
        for tag in orphans:
            code, value = tag
            if code == 9:
                var_name = tag
            elif var_name is not None:
                header.append(var_name)
                header.append(tag)
                var_name = None

    def check_entities(self, entities: list[Tags], is_r12: bool) -> Iterator[Tags]:
        """Yields the structure validated `entities`.

        Structure tags listed in ``EXCLUDE_STRUCTURE_CHECK`` are passed
        through unchanged. If `is_r12` is ``True`` the subclass markers
        (group code 100) are removed from the validated entities.

        """
        subclass_markers = (100,)
        for entity in entities:
            _, dxftype = entity[0]
            if dxftype in EXCLUDE_STRUCTURE_CHECK:
                yield entity
            else:
                # raises DXFStructureError() for invalid entities
                tags = Tags(entity_structure_validator(entity))
                if is_r12:
                    # subclass markers (100, ...) in DXF R12 files confuses the
                    # ezdxf parser #1106
                    tags.remove_tags(subclass_markers)
                yield tags

    def recover_rootdict(self):
        """Moves a misplaced root DICTIONARY to the first position of the
        OBJECTS section by swapping it with the current first entity.
        """
        objects = self.section_dict.get("OBJECTS")
        if not objects or len(objects) < 2:
            return  # empty OBJECTS section
        # index 0 is [DXFTag(0, 'SECTION'), DXFTag(2, 'OBJECTS')], this is a
        # requirement to be stored in the section_dict!
        if _is_rootdict(objects[1]):
            return  # everything is fine
        index, rootdict = _find_rootdict(objects)
        if index:  # make rootdict to first entity in OBJECTS section
            objects[index] = objects[1]
            objects[1] = rootdict
            try:
                handle = rootdict.get_handle()
            except const.DXFValueError:
                handle = "None"
            self.fixes.append(
                (
                    AuditError.MISPLACED_ROOT_DICT,
                    f"Recovered misplaced root DICTIONARY(#{handle}).",
                )
            )

    def fix_broken_layout_links(self):
        """Fixes broke links (block_record_handle) between LAYOUT and BLOCK_RECORD
        entities. See issue #997 for more information.

        Not implemented in this class, the method does nothing.
        """
        pass
|
||||
|
||||
|
||||
def _detect_dxf_version(header: list) -> str:
|
||||
next_is_dxf_version = False
|
||||
for tag in header:
|
||||
if next_is_dxf_version:
|
||||
dxfversion = str(tag[1]).strip()
|
||||
if re.fullmatch(r"AC[0-9]{4}", dxfversion):
|
||||
return dxfversion
|
||||
else:
|
||||
break
|
||||
if tag == (9, "$ACADVER"):
|
||||
next_is_dxf_version = True
|
||||
return const.DXF12
|
||||
|
||||
|
||||
def _is_rootdict(entity: Tags) -> bool:
|
||||
if entity[0] != (0, "DICTIONARY"):
|
||||
return False
|
||||
# The entry "ACAD_GROUP" in the rootdict is absolutely necessary!
|
||||
return any(tag == (3, "ACAD_GROUP") for tag in entity)
|
||||
|
||||
|
||||
def _find_rootdict(objects: list[Tags]) -> tuple[int, Tags]:
    """Returns the index and the tags of the first root dictionary in
    `objects`, or index 0 and an empty :class:`Tags` container if no root
    dictionary exists.
    """
    hit = next(
        (
            (position, candidate)
            for position, candidate in enumerate(objects)
            if _is_rootdict(candidate)
        ),
        None,
    )
    if hit is None:
        return 0, Tags()
    return hit
|
||||
|
||||
|
||||
def safe_tag_loader(
    stream: BinaryIO,
    loader: Optional[Callable] = None,
    messages: Optional[list] = None,
    errors: str = "surrogateescape",
) -> Iterator[DXFTag]:
    """Yields :class:``DXFTag`` objects from a bytes `stream`
    (untrusted external source), skips all comment tags (group code == 999).

    - Detects the text encoding from the tag stream.
    - Applies the repair filters for the tag order, invalid point codes and
      invalid handles.
    - Pass :func:`synced_bytes_loader` as argument `loader` to brute force
      load invalid tag structure.

    Args:
        stream: input data stream as bytes
        loader: low level tag loader, default loader is :func:`bytes_loader`
        messages: list to store error messages
        errors: specify decoding error handler

            - "surrogateescape" to preserve possible binary data (default)
            - "ignore" to silently drop data which can not be decoded
            - "strict" to raise an :class:`UnicodeDecodeError` exception for
              invalid data

    """
    low_level_loader = bytes_loader if loader is None else loader
    # Split the raw tag stream: one branch for the encoding detection, the
    # other branch for the repair and compile pipeline.
    pipeline, probe = itertools.tee(low_level_loader(stream), 2)
    encoding = detect_encoding(probe)

    # Apply repair filter:
    for repair_filter in (
        repair.tag_reorder_layer,
        repair.filter_invalid_point_codes,
        repair.filter_invalid_handles,
    ):
        pipeline = repair_filter(pipeline)  # type: ignore
    return byte_tag_compiler(pipeline, encoding, messages=messages, errors=errors)
|
||||
|
||||
|
||||
INT_PATTERN_S = re.compile(r"[+-]?\d+")
|
||||
INT_PATTERN_B = re.compile(rb"[+-]?\d+")
|
||||
|
||||
|
||||
def _search_int(s: Union[str, bytes]) -> int:
|
||||
"""Emulate the behavior of the C function stoll(), which just stop
|
||||
converting strings to integers at the first invalid char without raising
|
||||
an exception. e.g. "42xyz" is a valid integer 42
|
||||
|
||||
"""
|
||||
res = re.search(
|
||||
INT_PATTERN_S if isinstance(s, str) else INT_PATTERN_B, s # type: ignore
|
||||
)
|
||||
if res:
|
||||
s = res.group()
|
||||
return int(s)
|
||||
|
||||
|
||||
FLOAT_PATTERN_S = re.compile(r"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?")
|
||||
FLOAT_PATTERN_B = re.compile(rb"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?")
|
||||
|
||||
|
||||
def _search_float(s: Union[str, bytes]) -> float:
|
||||
"""Emulate the behavior of the C function stod(), which just stop
|
||||
converting strings to doubles at the first invalid char without raising
|
||||
an exception. e.g. "47.11xyz" is a valid double 47.11
|
||||
|
||||
"""
|
||||
res = re.search(
|
||||
FLOAT_PATTERN_S if isinstance(s, str) else FLOAT_PATTERN_B, s # type: ignore
|
||||
)
|
||||
if res:
|
||||
s = res.group()
|
||||
return float(s)
|
||||
|
||||
|
||||
@typing.no_type_check
def bytes_loader(stream: BinaryIO) -> Iterator[DXFTag]:
    """Yields :class:``DXFTag`` objects from a bytes `stream`
    (untrusted external source), skips all comment tags (group code == 999).

    ``DXFTag.code`` is always an ``int`` and ``DXFTag.value`` is always a
    raw bytes value without line endings. Works with file system streams and
    :class:`BytesIO` streams.

    Loading stops after the (0, EOF) tag or at the end of the stream.

    Raises:
        DXFStructureError: Found invalid group code.

    """
    line_number = 1
    next_line = stream.readline
    while True:
        raw_code = next_line()
        # ByteIO(): empty strings indicates EOF - does not raise an exception
        if not raw_code:
            return
        try:
            code = int(raw_code)
        except ValueError:
            try:  # harder to find an int
                code = _search_int(raw_code)
            except ValueError:
                text = raw_code.decode(errors="ignore").rstrip("\r\n")
                raise const.DXFStructureError(
                    f'Invalid group code "{text}" at line {line_number}.'
                )

        raw_value = next_line()
        # ByteIO(): empty strings indicates EOF
        if not raw_value:
            return
        value = raw_value.rstrip(b"\r\n")
        if code != 999:
            yield DXFTag(code, value)
        if code == 0 and value == b"EOF":
            return
        line_number += 2
|
||||
|
||||
|
||||
def synced_bytes_loader(stream: BinaryIO) -> Iterator[DXFTag]:
    """Yields :class:``DXFTag`` objects from a bytes `stream`
    (untrusted external source), skips all comment tags (group code == 999).

    ``DXFTag.code`` is always an ``int`` and ``DXFTag.value`` is always a
    raw bytes value without line endings. Works with file system streams and
    :class:`BytesIO` streams.

    Does not raise DXFStructureError on invalid group codes, instead skips
    lines until a valid group code or EOF is found.

    This can remove invalid lines before group codes, but can not
    detect invalid lines between group code and tag value.

    """
    code_limit = MAX_GROUP_CODE + 1
    next_line = stream.readline
    while True:
        # Skip lines until one contains an integer in the valid code range.
        while True:
            raw_code = next_line()
            if not raw_code:
                return  # empty string is EOF
            try:  # hard to find an int
                code = _search_int(raw_code)
            except ValueError:
                continue
            if 0 <= code < code_limit:
                break

        raw_value = next_line()
        if not raw_value:
            return  # empty string is EOF
        if code != 999:
            yield DXFTag(code, raw_value.rstrip(b"\r\n"))
|
||||
|
||||
|
||||
# Names of the header variables evaluated by detect_encoding(), as raw bytes
# like the values delivered by the low level tag loaders.
DWGCODEPAGE, ACADVER = b"$DWGCODEPAGE", b"$ACADVER"
|
||||
|
||||
|
||||
def _strip_whitespace(s: str) -> str:
|
||||
ws = set(string.whitespace)
|
||||
return "".join([c for c in s if c not in ws])
|
||||
|
||||
|
||||
def detect_encoding(tags: Iterable[DXFTag]) -> str:
    """Detect text encoding from header variables $DWGCODEPAGE and $ACADVER
    out of a stream of DXFTag objects.

    Assuming a malformed DXF file:

    The header variables could reside outside of the HEADER section,
    an ENDSEC tag is not a reliable fact that no $DWGCODEPAGE or
    $ACADVER header variable will show up in the remaining tag stream.

    Worst case: DXF file without a $ACADVER var, and a $DWGCODEPAGE
    unequal to "ANSI_1252" at the end of the file.

    Resolution order:

    - "utf8" if the detected DXF version is DXF R2007 or later, the search
      stops as soon as such a version is found
    - the encoding of $DWGCODEPAGE if present, even when $ACADVER is missing
    - the default encoding if none of the variables exist

    Args:
        tags: DXF tags with raw bytes values as yielded by the low level
            tag loaders

    """
    encoding = None
    dxfversion = None
    next_tag = None

    for code, value in tags:
        if code == 9:
            # Any other variable name cancels a pending variable, otherwise
            # the value of the wrong variable could be picked up.
            next_tag = value if value in (DWGCODEPAGE, ACADVER) else None
        elif code == 3 and next_tag == DWGCODEPAGE:  # e.g. (3, "ANSI_1252")
            encoding = toencoding(value.decode(const.DEFAULT_ENCODING))
            next_tag = None
        elif code == 1 and next_tag == ACADVER:  # e.g. (1, "AC1012")
            dxfversion = value.decode(const.DEFAULT_ENCODING)
            next_tag = None
            if dxfversion >= const.DXF2007:
                # The code page is irrelevant for DXF R2007 and later.
                return "utf8"

        if encoding and dxfversion:
            break

    if dxfversion and dxfversion >= const.DXF2007:
        return "utf8"
    # Use the detected code page even when $ACADVER is missing.
    return encoding or const.DEFAULT_ENCODING
|
||||
|
||||
|
||||
@typing.no_type_check
def byte_tag_compiler(
    tags: Iterable[DXFTag],
    encoding=const.DEFAULT_ENCODING,
    messages: Optional[list] = None,
    errors: str = "surrogateescape",
) -> Iterator[DXFTag]:
    """Compiles DXF tag values imported by bytes_loader() into Python types.

    Raises DXFStructureError() for invalid float values and invalid coordinate
    values.

    Expects DXF coordinates written in x, y[, z] order, see function
    :func:`safe_tag_loader` for usage with applied repair filters.

    A 2D vertex at the very end of the tag stream is yielded like any other
    vertex, a missing optional z-coordinate does not require a following tag.

    Args:
        tags: DXF tag generator, yielding tag values as bytes like bytes_loader()
        encoding: text encoding
        messages: list to store error messages
        errors: specify decoding error handler

            - "surrogateescape" to preserve possible binary data (default)
            - "ignore" to silently drop data which can not be decoded
            - "strict" to raise an :class:`UnicodeDecodeError` exception for
              invalid data

    Raises:
        DXFStructureError: Found invalid DXF tag or unexpected coordinate order.

    """

    def error_msg(tag):
        code = tag.code
        value = tag.value
        if isinstance(value, bytes):
            # Never raise a decoding error while building an error message.
            value = value.decode(encoding, errors="replace")
        return f'Invalid tag ({code}, "{value}") near line: {line}.'

    def recover_int(s: Union[str, bytes]) -> int:
        if isinstance(s, bytes):
            s = s.decode(encoding="utf8", errors="ignore")
        value = _search_int(s)
        msg = f'recovered invalid integer value "{s}" near line {line} as "{value}"'
        messages.append((AuditError.INVALID_INTEGER_VALUE, msg))
        logger.warning(msg)
        return value

    def recover_float(s: Union[str, bytes]) -> float:
        if isinstance(s, bytes):
            s = s.decode(encoding="utf8", errors="ignore")
        s = _strip_whitespace(s)
        value = _search_float(s)
        msg = f'recovered invalid floating point value "{s}" near line {line} as "{value}"'
        messages.append((AuditError.INVALID_FLOATING_POINT_VALUE, msg))
        logger.warning(msg)
        return value

    assert isinstance(encoding, str)
    assert isinstance(errors, str)

    if messages is None:
        messages = []
    tags = iter(tags)
    # A tag which was fetched as possible z-coordinate but is none, it has to
    # be processed as regular tag in the next loop iteration.
    undo_tag = None
    line = 0
    while True:
        try:
            if undo_tag is not None:
                x = undo_tag
                undo_tag = None
            else:
                x = next(tags)
                line += 2
            code = x.code
            if code in POINT_CODES:
                y = next(tags)  # y coordinate is mandatory
                line += 2
                # e.g. y-code for x-code=10 is 20
                if y.code != code + 10:
                    raise const.DXFStructureError(
                        f"Missing required y-coordinate near line: {line}."
                    )
                # optional z coordinate; the default prevents that an
                # exhausted tag stream drops the already loaded x and y
                # coordinates
                z = next(tags, None)
                if z is not None:
                    line += 2
                try:
                    # is it a z-coordinate like (30, 0.0) for base x-code=10
                    if z is not None and z.code == code + 20:
                        try:
                            point = (
                                float(x.value),
                                float(y.value),
                                float(z.value),
                            )
                        except ValueError:  # search for any float values
                            point = (
                                recover_float(x.value),
                                recover_float(y.value),
                                recover_float(z.value),
                            )
                    else:
                        try:
                            point = (float(x.value), float(y.value))
                        except ValueError:  # search for any float values
                            point = (
                                recover_float(x.value),
                                recover_float(y.value),
                            )
                        # None if the tag stream is exhausted, the next
                        # loop iteration terminates the generator
                        undo_tag = z
                except ValueError:
                    raise const.DXFStructureError(
                        f"Invalid floating point values near line: {line}."
                    )
                yield DXFVertex(code, point)
            elif code in BINARY_DATA:
                # maybe pre compiled in low level tagger (binary DXF)
                if isinstance(x, DXFBinaryTag):
                    tag = x
                else:
                    try:
                        tag = DXFBinaryTag.from_string(code, x.value)
                    except ValueError:
                        raise const.DXFStructureError(
                            f"Invalid binary data near line: {line}."
                        )
                yield tag
            else:  # just a single tag
                type_ = TYPE_TABLE.get(code, str)
                value: bytes = x.value
                if type_ is str:
                    if code == 0:
                        # remove white space from structure tags
                        value = x.value.strip().upper()
                    try:  # 2 stages to document decoding errors
                        str_ = value.decode(encoding, errors="strict")
                    except UnicodeDecodeError:
                        str_ = value.decode(encoding, errors=errors)
                        messages.append(
                            (
                                AuditError.DECODING_ERROR,
                                f"Fixed unicode decoding error near line {line}",
                            )
                        )

                    # exclude structure tags (code == 0):
                    if code:
                        # Convert DXF-Unicode notation "\U+xxxx" to unicode
                        if has_dxf_unicode(str_):
                            str_ = decode_dxf_unicode(str_)
                        # Convert MIF notation "\M+cxxxx" to unicode
                        elif has_mif_encoding(str_):
                            str_ = decode_mif_to_unicode(str_)
                    yield DXFTag(code, str_)
                else:
                    try:
                        # fast path for int and float
                        yield DXFTag(code, type_(value))
                    except ValueError:
                        # slow path - e.g. ProE stores int values as floats :((
                        if type_ is int:
                            try:
                                yield DXFTag(code, recover_int(x.value))
                            except ValueError:
                                raise const.DXFStructureError(error_msg(x))
                        elif type_ is float:
                            try:
                                yield DXFTag(code, recover_float(x.value))
                            except ValueError:
                                raise const.DXFStructureError(error_msg(x))
                        else:
                            raise const.DXFStructureError(error_msg(x))
        except StopIteration:
            return
|
||||
Reference in New Issue
Block a user