refactor: excel parse

2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
@@ -0,0 +1,141 @@
+#  Copyright (c) 2022, Manfred Moitzi
+#  License: MIT License
+from __future__ import annotations
+from typing import Iterator, Iterable, Optional
+import random
+import statistics
+import itertools
+import operator
+from collections import defaultdict
+from functools import reduce
+
+from ezdxf.math import AnyVec, Vec3, spherical_envelope
+from ezdxf.math.rtree import RTree
+
+__all__ = [
+    "dbscan",
+    "k_means",
+    "average_cluster_radius",
+    "average_intra_cluster_distance",
+]
+
+
+def dbscan(
+    points: list[AnyVec],
+    *,
+    radius: float,
+    min_points: int = 4,
+    rtree: Optional[RTree] = None,
+    max_node_size: int = 5,
+) -> list[list[AnyVec]]:
+    """DBSCAN clustering.
+
+    https://en.wikipedia.org/wiki/DBSCAN
+
+    Args:
+        points: list of points to cluster
+        radius: radius of the dense regions
+        min_points: minimum number of points that needs to be within the
+            `radius` for a point to be a core point (must be >= 2)
+        rtree: optional :class:`~ezdxf.math.rtree.RTree`
+        max_node_size: max node size for internally created RTree
+
+    Returns:
+        list of clusters, each cluster is a list of points
+
+    """
+    if min_points < 2:
+        raise ValueError("min_points must be >= 2")
+    if rtree is None:
+        rtree = RTree(points, max_node_size)
+
+    clusters: list[set[AnyVec]] = []
+    point_set = set(points)
+    while len(point_set):
+        point = point_set.pop()
+        todo = {point}
+        cluster = {point}  # the cluster has only a single entry if noise
+        clusters.append(cluster)
+        while len(todo):
+            chk_point = todo.pop()
+            neighbors = set(rtree.points_in_sphere(chk_point, radius))
+            if len(neighbors) < min_points:
+                continue
+            cluster.add(chk_point)
+            point_set.discard(chk_point)
+            todo |= neighbors.intersection(point_set)
+
+    return [list(cluster) for cluster in clusters]
+
+
+def k_means(
+    points: list[AnyVec], k: int, max_iter: int = 10
+) -> list[list[AnyVec]]:
+    """K-means clustering.
+
+    https://en.wikipedia.org/wiki/K-means_clustering
+
+    Args:
+        points: list of points to cluster
+        k: number of clusters
+        max_iter: max iterations
+
+    Returns:
+        list of clusters, each cluster is a list of points
+
+    """
+
+    def classify(centroids: Iterable[AnyVec]):
+        new_clusters: dict[AnyVec, list[AnyVec]] = defaultdict(list)
+        tree = RTree(centroids)
+        for point in points:
+            nn, _ = tree.nearest_neighbor(point)
+            new_clusters[nn].append(point)
+        return new_clusters
+
+    def recenter() -> Iterator[AnyVec]:
+        for cluster_points in clusters.values():
+            yield Vec3.sum(cluster_points) / len(cluster_points)
+        if len(clusters) < k:  # refill centroids if required
+            yield from random.sample(points, k - len(clusters))
+
+    def is_equal_clustering(old_clusters, new_clusters):
+        def hash_list(lst):
+            lst.sort()
+            return reduce(operator.xor, map(hash, lst))
+
+        h1 = sorted(map(hash_list, old_clusters.values()))
+        h2 = sorted(map(hash_list, new_clusters.values()))
+        return h1 == h2
+
+    if not (1 < k < len(points)):
+        raise ValueError(
+            "invalid argument k: must be in range [2, len(points)-1]"
+        )
+    clusters: dict[AnyVec, list[AnyVec]] = classify(random.sample(points, k))
+    for _ in range(max_iter):
+        new_clusters = classify(recenter())
+        if is_equal_clustering(clusters, new_clusters):
+            break
+        clusters = new_clusters
+    return list(clusters.values())
+
+
+def average_intra_cluster_distance(clusters: list[list[AnyVec]]) -> float:
+    """Returns the average point-to-point intra cluster distance."""
+
+    return statistics.mean(
+        [
+            p.distance(q)
+            for cluster in clusters
+            for (p, q) in itertools.combinations(cluster, 2)
+        ]
+    )
+
+
+def average_cluster_radius(clusters: list[list[AnyVec]]) -> float:
+    """Returns the average cluster radius."""
+
+    return statistics.mean(
+        [spherical_envelope(cluster)[1] for cluster in clusters]
+    )