ok but dupe highways
This commit is contained in:
11
README.md
11
README.md
@@ -5,6 +5,17 @@ See [https://wiki.openstreetmap.org/wiki/The_Villages_Road_and_Address_Import](h
|
||||
See compare-addresses.py for an automated way of running the complete address diff toolchain in one step.
|
||||
- TODO: fails to split out units
|
||||
|
||||
## New Instructions
|
||||
|
||||
### Roads
|
||||
|
||||
* Get new data from the county and convert it:
|
||||
* ``python shp-to-geojson.py "original data/Sumter/RoadCenterlines_041125.shp.zip" "original data/Sumter/RoadCenterlines_041125.geojson"``
|
||||
* Get new data from OSM:
|
||||
* `python download-overpass.py --type highways "Sumter County" "Florida" "original data/Sumter/sumter-roads-$(date +%y%m%d).geojson"`
|
||||
* Diff the roads:
|
||||
* `python threaded.py --exclude-unnamed --output "processed data/Sumter/diff-sumter-roads-$(date +%y%m%d).geojson" "original data/Sumter/sumter-roads-251122.geojson" "original data/Sumter/RoadCenterlines_041125.geojson"`
|
||||
|
||||
## Data
|
||||
|
||||
- Lake County Streets and Address Points: https://c.lakecountyfl.gov/ftp/GIS/GisDownloads/Shapefiles/
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
Download OSM data from Overpass API for a given county and save as GeoJSON.
|
||||
|
||||
Usage:
|
||||
python download-overpass.py "Sumter County Florida" highways.geojson
|
||||
python download-overpass.py "Lake County Florida" output/lake-addresses.geojson --type addresses
|
||||
python download-overpass.py "Sumter County Florida" paths.geojson --type multimodal
|
||||
python download-overpass.py --type highways "Sumter County" "Florida" output/roads.geojson
|
||||
python download-overpass.py --type addresses "Lake County" "Florida" output/addresses.geojson
|
||||
python download-overpass.py --type multimodal "Sumter County" "Florida" output/paths.geojson
|
||||
|
||||
TODO:
|
||||
- Don't just download roads. Probably ignore relations also.
|
||||
|
||||
File diff suppressed because one or more lines are too long
62
threaded.py
62
threaded.py
@@ -53,21 +53,23 @@ def titlecase(s):
|
||||
s)
|
||||
|
||||
class RoadComparator:
|
||||
def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0,
|
||||
n_jobs: int = None, chunk_size: int = 1000):
|
||||
def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0,
|
||||
n_jobs: int = None, chunk_size: int = 1000, exclude_unnamed: bool = False):
|
||||
"""
|
||||
Initialize the road comparator.
|
||||
|
||||
|
||||
Args:
|
||||
tolerance_feet: Distance tolerance for considering roads as overlapping (default: 50 feet)
|
||||
min_gap_length_feet: Minimum length of gap/extra to be considered significant (default: 100 feet)
|
||||
n_jobs: Number of parallel processes to use (default: CPU count - 1)
|
||||
chunk_size: Number of geometries to process per chunk (default: 1000)
|
||||
exclude_unnamed: Exclude features without name/highway tags from coverage (default: False)
|
||||
"""
|
||||
self.tolerance_feet = tolerance_feet
|
||||
self.min_gap_length_feet = min_gap_length_feet
|
||||
self.n_jobs = n_jobs or max(1, mp.cpu_count() - 1)
|
||||
self.chunk_size = chunk_size
|
||||
self.exclude_unnamed = exclude_unnamed
|
||||
|
||||
# Convert feet to degrees (approximate conversion for continental US)
|
||||
# 1 degree latitude ≈ 364,000 feet
|
||||
@@ -76,8 +78,29 @@ class RoadComparator:
|
||||
self.min_gap_length_deg = min_gap_length_feet / 364000.0
|
||||
|
||||
print(f"Using {self.n_jobs} parallel processes with chunk size {self.chunk_size}")
|
||||
|
||||
def load_geojson(self, filepath: str) -> gpd.GeoDataFrame:
|
||||
if self.exclude_unnamed:
|
||||
print("Excluding unnamed features from coverage calculation")
|
||||
|
||||
def _has_name(self, row) -> bool:
|
||||
"""Check if a feature has a name tag (for OSM data filtering)."""
|
||||
# Check for OSM-style tags (stored as JSON string)
|
||||
if 'tags' in row.index:
|
||||
tags = row.get('tags')
|
||||
if isinstance(tags, dict):
|
||||
return bool(tags.get('name'))
|
||||
elif isinstance(tags, str):
|
||||
# Tags stored as JSON string
|
||||
try:
|
||||
tags_dict = json.loads(tags)
|
||||
return bool(tags_dict.get('name'))
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return False
|
||||
return False
|
||||
# Check for direct name properties
|
||||
name = row.get('name') or row.get('NAME') or row.get('FULLNAME')
|
||||
return bool(name)
|
||||
|
||||
def load_geojson(self, filepath: str, filter_unnamed: bool = False) -> gpd.GeoDataFrame:
|
||||
"""Load and validate GeoJSON file with optimizations."""
|
||||
try:
|
||||
# Use pyogr engine for faster loading of large files
|
||||
@@ -98,7 +121,16 @@ class RoadComparator:
|
||||
if invalid_mask.any():
|
||||
print(f"Fixing {invalid_mask.sum()} invalid geometries...")
|
||||
gdf.loc[invalid_mask, 'geometry'] = gdf.loc[invalid_mask, 'geometry'].buffer(0)
|
||||
|
||||
|
||||
# Filter unnamed features if requested
|
||||
if filter_unnamed:
|
||||
original_count = len(gdf)
|
||||
named_mask = gdf.apply(self._has_name, axis=1)
|
||||
gdf = gdf[named_mask].copy()
|
||||
gdf = gdf.reset_index(drop=True)
|
||||
filtered_count = original_count - len(gdf)
|
||||
print(f"Filtered out {filtered_count} unnamed features")
|
||||
|
||||
print(f"Loaded {len(gdf)} road features from {filepath}")
|
||||
return gdf
|
||||
|
||||
@@ -388,9 +420,10 @@ class RoadComparator:
|
||||
print(f"Minimum significant length: {self.min_gap_length_feet} feet")
|
||||
print(f"Parallel processing: {self.n_jobs} workers")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
# Load both files
|
||||
gdf1 = self.load_geojson(file1_path)
|
||||
# Filter unnamed features from file1 (OSM data) if exclude_unnamed is set
|
||||
gdf1 = self.load_geojson(file1_path, filter_unnamed=self.exclude_unnamed)
|
||||
gdf2 = self.load_geojson(file2_path)
|
||||
|
||||
# Ensure both are in the same CRS
|
||||
@@ -444,7 +477,7 @@ class RoadComparator:
|
||||
print(f"Minimum significant length: {self.min_gap_length_feet} feet")
|
||||
|
||||
if removed:
|
||||
print(f"\n🔴 REMOVED ROADS ({len(removed)} segments):")
|
||||
print(f"\nREMOVED ROADS ({len(removed)} segments):")
|
||||
print("These road segments exist in File 1 but are missing or incomplete in File 2:")
|
||||
|
||||
# Calculate total length of removed segments
|
||||
@@ -474,7 +507,7 @@ class RoadComparator:
|
||||
print(f" • {road}: {len(lengths)} segment(s), {road_total:,.1f} feet")
|
||||
|
||||
if added:
|
||||
print(f"\n🔵 ADDED ROADS ({len(added)} roads):")
|
||||
print(f"\nADDED ROADS ({len(added)} roads):")
|
||||
print("These roads exist in File 2 but are missing or incomplete in File 1:")
|
||||
|
||||
# Calculate total length of added roads
|
||||
@@ -504,7 +537,7 @@ class RoadComparator:
|
||||
print(f" • {road}: {length:,.1f} feet")
|
||||
|
||||
if not removed and not added:
|
||||
print("\n✅ No significant differences found!")
|
||||
print("\nNo significant differences found!")
|
||||
print("The road networks have good coverage overlap within the specified tolerance.")
|
||||
|
||||
|
||||
@@ -532,7 +565,9 @@ Examples:
|
||||
help='Number of parallel processes (default: CPU count - 1)')
|
||||
parser.add_argument('--chunk-size', '-c', type=int, default=1000,
|
||||
help='Number of geometries to process per chunk (default: 1000)')
|
||||
|
||||
parser.add_argument('--exclude-unnamed', '-e', action='store_true',
|
||||
help='Exclude features without name tags from coverage calculation (helps detect roads covered by unnamed geometry)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate input files
|
||||
@@ -550,7 +585,8 @@ Examples:
|
||||
tolerance_feet=args.tolerance,
|
||||
min_gap_length_feet=args.min_length,
|
||||
n_jobs=args.jobs,
|
||||
chunk_size=args.chunk_size
|
||||
chunk_size=args.chunk_size,
|
||||
exclude_unnamed=args.exclude_unnamed
|
||||
)
|
||||
|
||||
removed, added = comparator.compare_roads(args.file1, args.file2)
|
||||
|
||||
Reference in New Issue
Block a user