ok but dupe highways

This commit is contained in:
2025-11-22 10:31:08 -08:00
parent e9e284ee66
commit 6a476bdb2e
4 changed files with 9569 additions and 482391 deletions

View File

@@ -53,21 +53,23 @@ def titlecase(s):
s)
class RoadComparator:
def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0,
n_jobs: int = None, chunk_size: int = 1000):
def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0,
n_jobs: int = None, chunk_size: int = 1000, exclude_unnamed: bool = False):
"""
Initialize the road comparator.
Args:
tolerance_feet: Distance tolerance for considering roads as overlapping (default: 50 feet)
min_gap_length_feet: Minimum length of gap/extra to be considered significant (default: 100 feet)
n_jobs: Number of parallel processes to use (default: CPU count - 1)
chunk_size: Number of geometries to process per chunk (default: 1000)
exclude_unnamed: Exclude features without name/highway tags from coverage (default: False)
"""
self.tolerance_feet = tolerance_feet
self.min_gap_length_feet = min_gap_length_feet
self.n_jobs = n_jobs or max(1, mp.cpu_count() - 1)
self.chunk_size = chunk_size
self.exclude_unnamed = exclude_unnamed
# Convert feet to degrees (approximate conversion for continental US)
# 1 degree latitude ≈ 364,000 feet
@@ -76,8 +78,29 @@ class RoadComparator:
self.min_gap_length_deg = min_gap_length_feet / 364000.0
print(f"Using {self.n_jobs} parallel processes with chunk size {self.chunk_size}")
def load_geojson(self, filepath: str) -> gpd.GeoDataFrame:
if self.exclude_unnamed:
print("Excluding unnamed features from coverage calculation")
def _has_name(self, row) -> bool:
    """Return True if a feature row carries a usable road name.

    Two sources are consulted, in order:

    1. An OSM-style ``tags`` property, either a dict or a JSON-encoded
       string, whose ``name`` entry is checked.
    2. The direct ``name``, ``NAME`` and ``FULLNAME`` properties.

    A value counts as a name only if it is not ``None``, not NaN and
    not an empty/whitespace-only string.  Missing values read from a
    DataFrame row are typically NaN, which is *truthy* in Python, so a
    plain ``bool(value)`` would report unnamed features as named.

    Args:
        row: A pandas Series-like row (must support ``.index`` and
            ``.get``) holding the feature's properties.

    Returns:
        True when any of the checked sources holds a usable name,
        False otherwise.
    """

    def _present(value) -> bool:
        # None and NaN both mean "no value"; NaN is the only float
        # that is not equal to itself.
        if value is None:
            return False
        if isinstance(value, float) and value != value:
            return False
        if isinstance(value, str):
            return bool(value.strip())
        try:
            return bool(value)
        except (TypeError, ValueError):
            # Ambiguous truthiness (e.g. pandas NA) is treated as missing.
            return False

    tags = row.get('tags') if 'tags' in row.index else None
    if isinstance(tags, str):
        # Tags stored as a JSON string
        try:
            tags = json.loads(tags)
        except (json.JSONDecodeError, TypeError):
            tags = None
    # json.loads may yield a list/str/number; only a dict can hold a name.
    if isinstance(tags, dict) and _present(tags.get('name')):
        return True

    # Fall back to direct name properties, also when the tags column
    # exists but is empty or unusable for this particular row.
    return any(_present(row.get(key)) for key in ('name', 'NAME', 'FULLNAME'))
def load_geojson(self, filepath: str, filter_unnamed: bool = False) -> gpd.GeoDataFrame:
"""Load and validate GeoJSON file with optimizations."""
try:
# Use pyogr engine for faster loading of large files
@@ -98,7 +121,16 @@ class RoadComparator:
if invalid_mask.any():
print(f"Fixing {invalid_mask.sum()} invalid geometries...")
gdf.loc[invalid_mask, 'geometry'] = gdf.loc[invalid_mask, 'geometry'].buffer(0)
# Filter unnamed features if requested
if filter_unnamed:
original_count = len(gdf)
named_mask = gdf.apply(self._has_name, axis=1)
gdf = gdf[named_mask].copy()
gdf = gdf.reset_index(drop=True)
filtered_count = original_count - len(gdf)
print(f"Filtered out {filtered_count} unnamed features")
print(f"Loaded {len(gdf)} road features from {filepath}")
return gdf
@@ -388,9 +420,10 @@ class RoadComparator:
print(f"Minimum significant length: {self.min_gap_length_feet} feet")
print(f"Parallel processing: {self.n_jobs} workers")
print("-" * 50)
# Load both files
gdf1 = self.load_geojson(file1_path)
# Filter unnamed features from file1 (OSM data) if exclude_unnamed is set
gdf1 = self.load_geojson(file1_path, filter_unnamed=self.exclude_unnamed)
gdf2 = self.load_geojson(file2_path)
# Ensure both are in the same CRS
@@ -444,7 +477,7 @@ class RoadComparator:
print(f"Minimum significant length: {self.min_gap_length_feet} feet")
if removed:
print(f"\n🔴 REMOVED ROADS ({len(removed)} segments):")
print(f"\nREMOVED ROADS ({len(removed)} segments):")
print("These road segments exist in File 1 but are missing or incomplete in File 2:")
# Calculate total length of removed segments
@@ -474,7 +507,7 @@ class RoadComparator:
print(f"{road}: {len(lengths)} segment(s), {road_total:,.1f} feet")
if added:
print(f"\n🔵 ADDED ROADS ({len(added)} roads):")
print(f"\nADDED ROADS ({len(added)} roads):")
print("These roads exist in File 2 but are missing or incomplete in File 1:")
# Calculate total length of added roads
@@ -504,7 +537,7 @@ class RoadComparator:
print(f"{road}: {length:,.1f} feet")
if not removed and not added:
print("\nNo significant differences found!")
print("\nNo significant differences found!")
print("The road networks have good coverage overlap within the specified tolerance.")
@@ -532,7 +565,9 @@ Examples:
help='Number of parallel processes (default: CPU count - 1)')
parser.add_argument('--chunk-size', '-c', type=int, default=1000,
help='Number of geometries to process per chunk (default: 1000)')
parser.add_argument('--exclude-unnamed', '-e', action='store_true',
help='Exclude features without name tags from coverage calculation (helps detect roads covered by unnamed geometry)')
args = parser.parse_args()
# Validate input files
@@ -550,7 +585,8 @@ Examples:
tolerance_feet=args.tolerance,
min_gap_length_feet=args.min_length,
n_jobs=args.jobs,
chunk_size=args.chunk_size
chunk_size=args.chunk_size,
exclude_unnamed=args.exclude_unnamed
)
removed, added = comparator.compare_roads(args.file1, args.file2)