Merge branch 'ok-but-dupe-highways'

2025-11-26 13:09:23 -08:00
parent 5cbcf53389 da55bc29a6
commit 86867c5c8d
28 changed files with 913 additions and 2836249 deletions
--- a/diff-highways.py
+++ b/diff-highways.py
@@ -54,22 +54,24 @@ def titlecase(s):
        s)

 class RoadComparator:
-    def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0, 
-                 n_jobs: int = None, chunk_size: int = 200):
+    def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0,
+                 n_jobs: int = None, chunk_size: int = 1000, exclude_unnamed: bool = False):
        """
        Initialize the road comparator.
-        
+
        Args:
            tolerance_feet: Distance tolerance for considering roads as overlapping (default: 50 feet)
            min_gap_length_feet: Minimum length of gap/extra to be considered significant (default: 100 feet)
-            n_jobs: Number of parallel processes to use (default: 2 for Windows)
-            chunk_size: Number of geometries to process per chunk (default: 200 for Windows)
+            n_jobs: Number of parallel processes to use (default: CPU count - 1)
+            chunk_size: Number of geometries to process per chunk (default: 1000)
+            exclude_unnamed: Exclude features without name/highway tags from coverage (default: False)
        """
        self.tolerance_feet = tolerance_feet
        self.min_gap_length_feet = min_gap_length_feet
        # Reduce worker count for Windows to prevent memory issues
        self.n_jobs = n_jobs or min(2, max(1, mp.cpu_count() // 2))
        self.chunk_size = chunk_size
+        self.exclude_unnamed = exclude_unnamed
        
        # Convert feet to degrees (approximate conversion for continental US)
        # 1 degree latitude ≈ 364,000 feet
@@ -78,8 +80,29 @@ class RoadComparator:
        self.min_gap_length_deg = min_gap_length_feet / 364000.0
        
        print(f"Using {self.n_jobs} parallel processes with chunk size {self.chunk_size}")
-    
-    def load_geojson(self, filepath: str) -> gpd.GeoDataFrame:
+        if self.exclude_unnamed:
+            print("Excluding unnamed features from coverage calculation")
+
+    def _has_name(self, row) -> bool:
+        """Check if a feature has a name tag (for OSM data filtering)."""
+        # Check for OSM-style tags (stored as JSON string)
+        if 'tags' in row.index:
+            tags = row.get('tags')
+            if isinstance(tags, dict):
+                return bool(tags.get('name'))
+            elif isinstance(tags, str):
+                # Tags stored as JSON string
+                try:
+                    tags_dict = json.loads(tags)
+                    return bool(tags_dict.get('name'))
+                except (json.JSONDecodeError, TypeError):
+                    return False
+            return False
+        # Check for direct name properties
+        name = row.get('name') or row.get('NAME') or row.get('FULLNAME')
+        return bool(name)
+
+    def load_geojson(self, filepath: str, filter_unnamed: bool = False) -> gpd.GeoDataFrame:
        """Load and validate GeoJSON file with optimizations."""
        try:
            # Use pyogr engine for faster loading of large files
@@ -100,7 +123,16 @@ class RoadComparator:
            if invalid_mask.any():
                print(f"Fixing {invalid_mask.sum()} invalid geometries...")
                gdf.loc[invalid_mask, 'geometry'] = gdf.loc[invalid_mask, 'geometry'].buffer(0)
-            
+
+            # Filter unnamed features if requested
+            if filter_unnamed:
+                original_count = len(gdf)
+                named_mask = gdf.apply(self._has_name, axis=1)
+                gdf = gdf[named_mask].copy()
+                gdf = gdf.reset_index(drop=True)
+                filtered_count = original_count - len(gdf)
+                print(f"Filtered out {filtered_count} unnamed features")
+
            print(f"Loaded {len(gdf)} road features from {filepath}")
            return gdf
            
@@ -111,8 +143,8 @@ class RoadComparator:
        """Create a buffered union using chunked processing for memory efficiency."""
        print("Creating optimized buffered union...")
        
-        # Process in chunks to manage memory
-        chunks = [gdf.iloc[i:i+self.chunk_size] for i in range(0, len(gdf), self.chunk_size)]
+        # Process in chunks to manage memory - extract geometries as lists
+        chunks = [gdf.iloc[i:i+self.chunk_size].geometry.tolist() for i in range(0, len(gdf), self.chunk_size)]
        chunk_unions = []
        
        # Use partial function for multiprocessing
@@ -147,17 +179,17 @@ class RoadComparator:
            raise Exception("No valid geometries to create union")
    
    @staticmethod
-    def _buffer_chunk(chunk_gdf: gpd.GeoDataFrame, tolerance: float) -> Any:
+    def _buffer_chunk(geometries: List, tolerance: float) -> Any:
        """Buffer geometries in a chunk and return their union."""
        try:
            # Buffer all geometries in the chunk
-            buffered = chunk_gdf.geometry.buffer(tolerance)
+            buffered = [geom.buffer(tolerance) for geom in geometries]
            
            # Create union of buffered geometries
            if len(buffered) == 1:
-                return buffered.iloc[0]
+                return buffered[0]
            else:
-                return unary_union(buffered.tolist())
+                return unary_union(buffered)
        except Exception as e:
            print(f"Error in chunk processing: {str(e)}")
            return None
@@ -177,9 +209,17 @@ class RoadComparator:
        """
        print("Finding removed segments...")
        
-        # Split into chunks for parallel processing
-        chunks = [source_gdf.iloc[i:i+self.chunk_size] 
-                 for i in range(0, len(source_gdf), self.chunk_size)]
+        # Split into chunks for parallel processing - convert to serializable format
+        chunks = []
+        for i in range(0, len(source_gdf), self.chunk_size):
+            chunk_gdf = source_gdf.iloc[i:i+self.chunk_size]
+            chunk_data = []
+            for idx, row in chunk_gdf.iterrows():
+                chunk_data.append({
+                    'geometry': row.geometry,
+                    'properties': dict(row.drop('geometry'))
+                })
+            chunks.append(chunk_data)
        
        all_removed = []
        
@@ -210,13 +250,14 @@ class RoadComparator:
        return all_removed
    
    @staticmethod
-    def _process_removed_chunk(chunk_gdf: gpd.GeoDataFrame, target_union: Any, 
+    def _process_removed_chunk(chunk_data: List[Dict], target_union: Any, 
                              min_length_deg: float) -> List[Dict[str, Any]]:
        """Process a chunk of geometries to find removed segments."""
        removed_segments = []
        
-        for idx, row in chunk_gdf.iterrows():
-            geom = row.geometry
+        for row_data in chunk_data:
+            geom = row_data['geometry']
+            properties = row_data['properties']
            
            # Handle MultiLineString by processing each component
            if isinstance(geom, MultiLineString):
@@ -245,12 +286,12 @@ class RoadComparator:
                    for uncovered_line in uncovered_lines:
                        if uncovered_line.length >= min_length_deg:
                            # Create properties dict with original metadata plus 'removed: true'
-                            properties = dict(row.drop('geometry'))
-                            properties['removed'] = True
+                            result_properties = properties.copy()
+                            result_properties['removed'] = True
                            
                            removed_segments.append({
                                'geometry': uncovered_line,
-                                **properties
+                                **result_properties
                            })
                            
                except Exception as e:
@@ -266,9 +307,17 @@ class RoadComparator:
        """
        print("Finding added roads...")
        
-        # Split into chunks for parallel processing
-        chunks = [source_gdf.iloc[i:i+self.chunk_size] 
-                 for i in range(0, len(source_gdf), self.chunk_size)]
+        # Split into chunks for parallel processing - convert to serializable format
+        chunks = []
+        for i in range(0, len(source_gdf), self.chunk_size):
+            chunk_gdf = source_gdf.iloc[i:i+self.chunk_size]
+            chunk_data = []
+            for idx, row in chunk_gdf.iterrows():
+                chunk_data.append({
+                    'geometry': row.geometry,
+                    'properties': dict(row.drop('geometry'))
+                })
+            chunks.append(chunk_data)
        
        all_added = []
        
@@ -299,13 +348,14 @@ class RoadComparator:
        return all_added
    
    @staticmethod
-    def _process_added_chunk(chunk_gdf: gpd.GeoDataFrame, target_union: Any, 
+    def _process_added_chunk(chunk_data: List[Dict], target_union: Any, 
                            min_length_deg: float) -> List[Dict[str, Any]]:
        """Process a chunk of geometries to find added roads."""
        added_roads = []
        
-        for idx, row in chunk_gdf.iterrows():
-            geom = row.geometry
+        for row_data in chunk_data:
+            geom = row_data['geometry']
+            original_properties = row_data['properties']
            
            try:
                # Check what portion of the road is not covered
@@ -328,45 +378,59 @@ class RoadComparator:
                    # 1. The uncovered portion is above minimum threshold, AND
                    # 2. More than 10% of the road is uncovered
                    if uncovered_ratio > 0.1:
-                        #uncovered_length >= min_length_deg and 
+                        #uncovered_length >= min_length_deg and
                        # Include entire original road with all original metadata
-                        original_properties = dict(row.drop('geometry'))

-                        #
-                        # For Sumter County Roads
-                        #
                        properties = {
                            'surface': 'asphalt'
                        }

-                        for key, value in original_properties.items():
-                            if key == 'NAME':
-                                properties['name'] = titlecase(qgisfunctions.formatstreet(value,None,None)) if value is not None else None
-                            elif key == 'SpeedLimit':
-                                properties['maxspeed'] = f"{value} mph" if value is not None else None
-                            elif key == 'RoadClass':
-                                if value is None:
-                                    properties['highway'] = 'residential'
-                                elif value.startswith('PRIMARY'):
-                                    properties['highway'] = 'trunk'
-                                elif value.startswith('MAJOR'):
-                                    properties['highway'] = 'primary'
-                                elif value.startswith('MINOR'):
-                                    properties['highway'] = 'secondary'
-                                elif value.startswith('SECONDARY'):
-                                    properties['highway'] = 'tertiary'
-                                elif value.startswith('TURN LANE'):
-                                    properties['highway'] = 'primary_link'
-                                # elif value.startswith('LOCAL'):
-                                else:
-                                    properties['highway'] = 'residential'
-                            elif lowercase(key) == 'oneway':
-                                properties['oneway'] = 'yes' if lowercase(value) == 'y' or lowercase(value) == 'ft' else None
-                            elif key == 'NumberOfLa':
-                                properties['lanes'] = value if value is not None else None
-                            # else:
-                            #     # Keep other properties as-is, or transform them as needed
-                            #     properties[key] = value
+                        # Detect county format based on available fields
+                        is_lake_county = 'FullStreet' in original_properties
+                        is_sumter_county = 'NAME' in original_properties and 'RoadClass' in original_properties
+
+                        if is_lake_county:
+                            # Lake County field mappings
+                            for key, value in original_properties.items():
+                                if key == 'FullStreet':
+                                    properties['name'] = titlecase(qgisfunctions.formatstreet(value,None,None)) if value is not None else None
+                                elif key == 'SpeedLimit':
+                                    properties['maxspeed'] = f"{value} mph" if value is not None else None
+                                elif key == 'NumberOfLa':
+                                    try:
+                                        num_value = int(float(value)) if value is not None else 0
+                                        if num_value > 0:
+                                            properties['lanes'] = str(num_value)
+                                    except (ValueError, TypeError):
+                                        pass
+                                elif key == 'StreetClas':
+                                    highway_type = qgisfunctions.gethighwaytype(value, None, None)
+                                    properties['highway'] = highway_type if highway_type else 'residential'
+                        elif is_sumter_county:
+                            # Sumter County field mappings
+                            for key, value in original_properties.items():
+                                if key == 'NAME':
+                                    properties['name'] = titlecase(qgisfunctions.formatstreet(value,None,None)) if value is not None else None
+                                elif key == 'SpeedLimit':
+                                    properties['maxspeed'] = f"{value} mph" if value is not None else None
+                                elif key == 'RoadClass':
+                                    if value is None:
+                                        properties['highway'] = 'residential'
+                                    elif value.startswith('PRIMARY'):
+                                        properties['highway'] = 'trunk'
+                                    elif value.startswith('MAJOR'):
+                                        properties['highway'] = 'primary'
+                                    else:
+                                        properties['highway'] = 'residential'
+                        else:
+                            # Unknown format - try common field names
+                            name = original_properties.get('NAME') or original_properties.get('FullStreet') or original_properties.get('name')
+                            if name:
+                                properties['name'] = titlecase(qgisfunctions.formatstreet(name,None,None))
+                            speed = original_properties.get('SpeedLimit')
+                            if speed:
+                                properties['maxspeed'] = f"{speed} mph"
+                            properties['highway'] = 'residential'

                        added_roads.append({
                            'geometry': geom,
@@ -392,9 +456,10 @@ class RoadComparator:
        print(f"Minimum significant length: {self.min_gap_length_feet} feet")
        print(f"Parallel processing: {self.n_jobs} workers")
        print("-" * 50)
-        
+
        # Load both files
-        gdf1 = self.load_geojson(file1_path)
+        # Filter unnamed features from file1 (OSM data) if exclude_unnamed is set
+        gdf1 = self.load_geojson(file1_path, filter_unnamed=self.exclude_unnamed)
        gdf2 = self.load_geojson(file2_path)
        
        # Ensure both are in the same CRS
@@ -432,9 +497,20 @@ class RoadComparator:
        print("Saving results...")
        results_gdf = gpd.GeoDataFrame(all_results)
        
-        # Save to file with optimization
-        results_gdf.to_file(output_path, driver='GeoJSON', engine='pyogrio')
-        print(f"Results saved to: {output_path}")
+        # Save to file with optimization, with fallback for locked files
+        try:
+            results_gdf.to_file(output_path, driver='GeoJSON', engine='pyogrio')
+            print(f"Results saved to: {output_path}")
+        except (PermissionError, OSError) as e:
+            # File is locked, try with a timestamp suffix
+            from datetime import datetime
+            timestamp = datetime.now().strftime("%H%M%S")
+            base = Path(output_path)
+            fallback_path = str(base.parent / f"{base.stem}_{timestamp}{base.suffix}")
+            print(f"Warning: Could not save to {output_path}: {e}")
+            print(f"Saving to fallback: {fallback_path}")
+            results_gdf.to_file(fallback_path, driver='GeoJSON', engine='pyogrio')
+            print(f"Results saved to: {fallback_path}")
    
    def print_summary(self, removed: List[Dict], added: List[Dict], file1_name: str, file2_name: str):
        """Print a summary of the comparison results."""
@@ -448,7 +524,7 @@ class RoadComparator:
        print(f"Minimum significant length: {self.min_gap_length_feet} feet")
        
        if removed:
-            print(f"\n🔴 REMOVED ROADS ({len(removed)} segments):")
+            print(f"\nREMOVED ROADS ({len(removed)} segments):")
            print("These road segments exist in File 1 but are missing or incomplete in File 2:")
            
            # Calculate total length of removed segments
@@ -478,7 +554,7 @@ class RoadComparator:
                print(f"  • {road}: {len(lengths)} segment(s), {road_total:,.1f} feet")
        
        if added:
-            print(f"\n🔵 ADDED ROADS ({len(added)} roads):")
+            print(f"\nADDED ROADS ({len(added)} roads):")
            print("These roads exist in File 2 but are missing or incomplete in File 1:")
            
            # Calculate total length of added roads
@@ -508,7 +584,7 @@ class RoadComparator:
                print(f"  • {road}: {length:,.1f} feet")
        
        if not removed and not added:
-            print("\n✅ No significant differences found!")
+            print("\nNo significant differences found!")
            print("The road networks have good coverage overlap within the specified tolerance.")


@@ -536,7 +612,9 @@ Examples:
                       help='Number of parallel processes (default: CPU count - 1)')
    parser.add_argument('--chunk-size', '-c', type=int, default=1000,
                       help='Number of geometries to process per chunk (default: 1000)')
-    
+    parser.add_argument('--exclude-unnamed', '-e', action='store_true',
+                       help='Exclude features without name tags from coverage calculation (helps detect roads covered by unnamed geometry)')
+
    args = parser.parse_args()
    
    # Validate input files
@@ -554,7 +632,8 @@ Examples:
            tolerance_feet=args.tolerance,
            min_gap_length_feet=args.min_length,
            n_jobs=args.jobs,
-            chunk_size=args.chunk_size
+            chunk_size=args.chunk_size,
+            exclude_unnamed=args.exclude_unnamed
        )
        
        removed, added = comparator.compare_roads(args.file1, args.file2)