Add and update scripts, finished Sumter

Will Bradley 2025-07-28 18:49:39 -07:00
parent 0cea39bc1d
commit ec0e4a6efb
12 changed files with 175287 additions and 11 deletions

View File

@@ -4,8 +4,8 @@ See [https://wiki.openstreetmap.org/wiki/The_Villages_Road_and_Address_Import](h
## Data
Lake County: https://c.lakecountyfl.gov/ftp/GIS/GisDownloads/Shapefiles/
Sumter GIS is via emailing their GIS team and accessing their Dropbox.
- Lake County Streets and Address Points: https://c.lakecountyfl.gov/ftp/GIS/GisDownloads/Shapefiles/
- Sumter GIS Road Centerlines, Addresses, and Multi Modal Trails are available by emailing their GIS team and accessing their Dropbox (https://www.dropbox.com/scl/fo/67nh5y8e42tr2kzdmmcg4/AAsF7Ay0MRUN-e_Ajlh5yWQ?rlkey=h6u606av0d2zkszk9lm3qijlt&e=1&st=7j7i94f8&dl=0)
## Instructions
@@ -16,6 +16,7 @@ Sumter GIS is via emailing their GIS team and accessing their Dropbox.
### For Sumter County:
* Always use the Filter with Form function to select all entries with `"LIFECYCLE"='Current'`
* For roads:
* `NAME` becomes the virtual `name` via `title(formatstreet("NAME"))` (see the Python sketch after this file's diff)
* `SpeedLimit` becomes the virtual `maxspeed` via `concat("SpeedLimit",' mph')`
@@ -30,6 +31,8 @@ Sumter GIS is via emailing their GIS team and accessing their Dropbox.
* `POST_CODE` becomes `addr:postcode` (or `addr:postc` temporarily) as an integer
* Manually add `addr:state` = `'FL'`
* For multi-modal trails (golf cart paths):
* Download all `highway=path` and `highway=cycleway` ways with `golf_cart=yes` for comparison
* Omit features with `Part_of_Ro`=`Yes` as separate paths; apply golf-cart tagging to those streets directly.
* `bicycle=yes`
* `foot=yes`
* `golf=cartpath`
@@ -38,7 +41,6 @@ Sumter GIS is via emailing their GIS team and accessing their Dropbox.
* `motor_vehicle=no`
* `segregated=no`
* `surface=asphalt`
* Use the Filter with Form function to Select all entries with `"LIFECYCLE"='Current'`
### For Lake County:
@@ -63,7 +65,7 @@ Sumter GIS is via emailing their GIS team and accessing their Dropbox.
* Export to GeoJSON, only exporting **selected** entries, **selecting only the OSM-formatted fields we want**.
* Here you can rename temporary columns like `addr:house` to `addr:housenumber`.
* Ensure the export file is in the `EPSG:4326 - WGS84` CRS.
* Open in JSOM. It's suggested to begin with roads first, addresses second, so the addresses can be placed in context.
* Open in JOSM. It's suggested to begin with roads first, addresses second, so the addresses can be placed in context.
* In the Roads dataset, select and remove all relations from the geojson/shapefile layer: the data often has one relation per road, which is improper for OSM import.
* Select a small region to work on: one neighborhood or smaller. For this import, we assume that only newly-constructed small residential areas will be imported, not main roads, commercial areas, or areas with significant existing map data.
* Download the area you're working on from OSM, into a new Data Layer (not your geojson layer).
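
The Sumter road mapping above can be spot-checked outside QGIS as well. A minimal GeoPandas sketch, assuming the `title` and `formatstreet` helpers from the qgis-functions file below and a hypothetical `roads.geojson` export of the Current-lifecycle roads:

import importlib

import geopandas

# qgis-functions has a hyphen in its name, so import it via importlib
# (the same trick sumter-address-convert.py uses below).
qf = importlib.import_module("qgis-functions")

roads = geopandas.read_file("roads.geojson")  # hypothetical export

# Mirror the QGIS virtual fields: title(formatstreet("NAME")) and
# concat("SpeedLimit", ' mph').
roads["name"] = roads["NAME"].map(lambda n: qf.title(qf.formatstreet(n, None, None)))
roads["maxspeed"] = roads["SpeedLimit"].astype(str) + " mph"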

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -8,12 +8,19 @@ import re
# or >1 suffix-letters, like 12th Street or 243rd Ave.
#
def title(s):
return re.sub(
r"[A-Za-z0-9]+('[A-Za-z0-9]+)?",
lambda word: word.group(0).capitalize(),
s)
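# Illustrative behavior of title() (worked examples added for this writeup,
# not from the original source):
#   title("243RD AVE")  -> "243rd Ave"   (digit-led tokens keep lowercase suffixes)
#   title("O'BRIEN ST") -> "O'brien St"  (an apostrophe group is one token)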
# @qgsfunction(args='auto', group='Custom', referenced_columns=[])
def getstreetfromaddress(value1, feature, parent):
parts = value1.split()
parts.pop(0) # Ignore the first bit (i.e. "123" in "123 N MAIN ST")
parts = map(formatstreetname, parts)
return " ".join(parts)
#parts = map(formatstreetname, parts)
#return " ".join(parts)
return formatstreet(" ".join(parts), None, None)
# @qgsfunction(args='auto', group='Custom', referenced_columns=[])
def formatstreet(value1, feature, parent):
@@ -26,10 +33,12 @@ def formatstreet(value1, feature, parent):
parts[0] = "Royal"
parts[1] = "Saint"
# And "CR" as a first part (County Road) vs last part (Circle)
if parts[0].upper() == "C ":
parts[0] = "County Road "
if parts[0].upper() == "CR":
parts[0] = "County Road"
if parts[0].upper() == "SR":
parts[0] = "State Route"
parts[0] = "State Road"
parts = map(formatstreetname, parts)
return " ".join(parts)

View File

@@ -1,5 +1,75 @@
import geopandas
df = geopandas.read_file('original data/Sumter/RoadCenterlines_041125.shp.zip')
df = df.to_crs(4326) # Convert to WGS 84
exploded = df.explode()
exploded.to_file('original data/Sumter/RoadCenterlines_041125.geojson', driver='GeoJSON')
import sys
import os
from pathlib import Path
def convert_shapefile_to_geojson(
input_shapefile,
output_geojson,
target_crs=4326 # Convert to WGS 84
):
"""
Main conversion function
Args:
input_shapefile: Path to input shapefile
output_geojson: Path to output GeoJSON file
target_crs: Target coordinate reference system
"""
try:
# Read shapefile
print(f"Reading shapefile: {input_shapefile}")
df = geopandas.read_file(input_shapefile)
print(f"Converting to CRS {target_crs}")
df = df.to_crs(target_crs)
exploded = df.explode()
exploded.to_file(output_geojson, driver='GeoJSON')
except Exception as e:
print(f"Error during conversion: {str(e)}")
sys.exit(1)
def main():
"""
Main function to handle command line arguments
"""
import argparse
parser = argparse.ArgumentParser(
description='Convert shapefile to GeoJSON'
)
parser.add_argument(
'input_shapefile',
help='Path to input shapefile'
)
parser.add_argument(
'output_geojson',
help='Path to output GeoJSON file'
)
    parser.add_argument(
        '--target-crs',
        type=int,
        default=4326,
        help='Target coordinate reference system (default: 4326)'
    )
args = parser.parse_args()
# Validate input file
if not os.path.exists(args.input_shapefile):
print(f"Error: Input shapefile '{args.input_shapefile}' not found")
sys.exit(1)
# Create output directory if it doesn't exist
output_dir = Path(args.output_geojson).parent
output_dir.mkdir(parents=True, exist_ok=True)
# Run conversion
convert_shapefile_to_geojson(
args.input_shapefile,
args.output_geojson,
args.target_crs
)
if __name__ == "__main__":
    main()
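
The `explode()` call is what splits multi-part road geometries into single LineStrings before export. Invoked from Python rather than the command line, the conversion might look like this (the paths are the ones the old one-off version of this script used):

convert_shapefile_to_geojson(
    'original data/Sumter/RoadCenterlines_041125.shp.zip',
    'original data/Sumter/RoadCenterlines_041125.geojson',
    target_crs=4326,
)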

sumter-address-convert.py (new file, 258 lines)
View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Shapefile to GeoJSON Converter for Address Data
Converts ESRI:102659 CRS shapefile to EPSG:4326 GeoJSON with OSM-style address tags
"""
import geopandas as gpd
import pandas as pd  # used by the addr:* field conversions below
import json
import sys
import os
from pathlib import Path
import importlib
qgis_functions = importlib.import_module("qgis-functions")
title = qgis_functions.title
getstreetfromaddress = qgis_functions.getstreetfromaddress
def convert_crs(gdf, source_crs='ESRI:102659', target_crs='EPSG:4326'):
"""
Convert coordinate reference system from source to target CRS
Args:
gdf: GeoDataFrame to convert
source_crs: Source coordinate reference system (default: ESRI:102659)
target_crs: Target coordinate reference system (default: EPSG:4326)
Returns:
GeoDataFrame with converted CRS
"""
    if gdf.crs is None:
        print(f"Warning: No CRS detected, assuming {source_crs}")
        gdf = gdf.set_crs(source_crs)
if gdf.crs != target_crs:
print(f"Converting from {gdf.crs} to {target_crs}")
gdf = gdf.to_crs(target_crs)
return gdf
def process_address_fields(gdf):
"""
Process and map address fields according to OSM address schema
Args:
gdf: GeoDataFrame with address data
Returns:
GeoDataFrame with processed address fields
"""
processed_gdf = gdf.copy()
# Create new columns for OSM address tags
address_mapping = {}
# ADD_NUM -> addr:housenumber (as integer)
if 'ADD_NUM' in processed_gdf.columns:
# Handle NaN values and convert to nullable integer
add_num_series = processed_gdf['ADD_NUM'].copy()
# Convert to numeric, coercing errors to NaN
add_num_series = pd.to_numeric(add_num_series, errors='coerce')
# Round to remove decimal places, then convert to nullable integer
address_mapping['addr:housenumber'] = add_num_series.round().astype('Int64')
# UNIT -> addr:unit (as string)
if 'UNIT' in processed_gdf.columns:
unit_series = processed_gdf['UNIT'].copy()
# Replace NaN, empty strings, and 'None' string with actual None
unit_series = unit_series.replace(['nan', 'None', '', None], None)
# Only keep non-null values as strings
unit_series = unit_series.where(unit_series.notna(), None)
address_mapping['addr:unit'] = unit_series
# SADD -> addr:street via title(getstreetfromaddress("SADD"))
if 'SADD' in processed_gdf.columns:
street_names = []
for sadd_value in processed_gdf['SADD']:
if pd.notna(sadd_value):
street_from_addr = getstreetfromaddress(str(sadd_value), None, None)
street_titled = title(street_from_addr)
street_names.append(street_titled)
else:
street_names.append(None)
address_mapping['addr:street'] = street_names
# POST_COMM -> addr:city via title("POST_COMM")
if 'POST_COMM' in processed_gdf.columns:
city_names = []
for post_comm in processed_gdf['POST_COMM']:
if pd.notna(post_comm):
city_titled = title(str(post_comm))
city_names.append(city_titled)
else:
city_names.append(None)
address_mapping['addr:city'] = city_names
# POST_CODE -> addr:postcode (as integer)
if 'POST_CODE' in processed_gdf.columns:
# Handle NaN values and convert to nullable integer
post_code_series = processed_gdf['POST_CODE'].copy()
# Convert to numeric, coercing errors to NaN
post_code_series = pd.to_numeric(post_code_series, errors='coerce')
# Round to remove decimal places, then convert to nullable integer
address_mapping['addr:postcode'] = post_code_series.round().astype('Int64')
# Manually add addr:state = 'FL'
address_mapping['addr:state'] = 'FL'
# Add the new address columns to the GeoDataFrame
for key, value in address_mapping.items():
processed_gdf[key] = value
return processed_gdf
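# Worked example of the mapping above (illustrative values):
#   ADD_NUM 1234.0       -> addr:housenumber 1234 (nullable Int64; missing stays <NA>)
#   POST_CODE '34785'    -> addr:postcode 34785
#   POST_COMM 'WILDWOOD' -> addr:city 'Wildwood'
#   SADD goes through getstreetfromaddress + title, dropping the house number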
def clean_output_data(gdf, keep_original_fields=False):
"""
Clean the output data, optionally keeping original fields
Args:
gdf: GeoDataFrame to clean
keep_original_fields: Whether to keep original shapefile fields
Returns:
Cleaned GeoDataFrame
"""
# Define the OSM address fields we want to keep
osm_fields = [
'addr:housenumber', 'addr:unit', 'addr:street',
'addr:city', 'addr:postcode', 'addr:state'
]
if keep_original_fields:
# Keep both original and OSM fields
original_fields = ['ADD_NUM', 'UNIT', 'SADD', 'POST_COMM', 'POST_CODE']
fields_to_keep = list(set(osm_fields + original_fields + ['geometry']))
else:
# Keep only OSM fields and geometry
fields_to_keep = osm_fields + ['geometry']
# Filter to only existing columns
existing_fields = [field for field in fields_to_keep if field in gdf.columns]
return gdf[existing_fields]
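# e.g. clean_output_data(gdf) keeps just the six addr:* columns plus geometry,
# while clean_output_data(gdf, keep_original_fields=True) also keeps ADD_NUM,
# UNIT, SADD, POST_COMM and POST_CODE for side-by-side review.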
def convert_shapefile_to_geojson(
input_shapefile,
output_geojson,
keep_original_fields=False,
source_crs='ESRI:102659',
target_crs='EPSG:4326'
):
"""
Main conversion function
Args:
input_shapefile: Path to input shapefile
output_geojson: Path to output GeoJSON file
keep_original_fields: Whether to keep original shapefile fields
source_crs: Source coordinate reference system
target_crs: Target coordinate reference system
"""
try:
# Read shapefile
print(f"Reading shapefile: {input_shapefile}")
gdf = gpd.read_file(input_shapefile)
print(f"Loaded {len(gdf)} features")
# Display original columns
print(f"Original columns: {list(gdf.columns)}")
# Convert CRS if needed
gdf = convert_crs(gdf, source_crs, target_crs)
# Process address fields
print("Processing address fields...")
gdf = process_address_fields(gdf)
# Clean output data
gdf = clean_output_data(gdf, keep_original_fields)
# Remove rows with no valid geometry
gdf = gdf[gdf.geometry.notna()]
print(f"Final columns: {list(gdf.columns)}")
print(f"Final feature count: {len(gdf)}")
# Write to GeoJSON
print(f"Writing GeoJSON: {output_geojson}")
gdf.to_file(output_geojson, driver='GeoJSON')
print(f"Conversion completed successfully!")
# Display sample of processed data
if len(gdf) > 0:
print("\nSample of processed data:")
sample_cols = [col for col in gdf.columns if col.startswith('addr:')]
if sample_cols:
print(gdf[sample_cols].head())
except Exception as e:
print(f"Error during conversion: {str(e)}")
sys.exit(1)
def main():
"""
Main function to handle command line arguments
"""
import argparse
parser = argparse.ArgumentParser(
description='Convert shapefile to GeoJSON with OSM address tags'
)
parser.add_argument(
'input_shapefile',
help='Path to input shapefile'
)
parser.add_argument(
'output_geojson',
help='Path to output GeoJSON file'
)
parser.add_argument(
'--keep-original',
action='store_true',
help='Keep original shapefile fields in addition to OSM fields'
)
parser.add_argument(
'--source-crs',
default='ESRI:102659',
help='Source coordinate reference system (default: ESRI:102659)'
)
parser.add_argument(
'--target-crs',
default='EPSG:4326',
help='Target coordinate reference system (default: EPSG:4326)'
)
args = parser.parse_args()
# Validate input file
if not os.path.exists(args.input_shapefile):
print(f"Error: Input shapefile '{args.input_shapefile}' not found")
sys.exit(1)
# Create output directory if it doesn't exist
output_dir = Path(args.output_geojson).parent
output_dir.mkdir(parents=True, exist_ok=True)
# Run conversion
convert_shapefile_to_geojson(
args.input_shapefile,
args.output_geojson,
args.keep_original,
args.source_crs,
args.target_crs
)
if __name__ == "__main__":
    main()
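
Run from a shell, an invocation might look like this (the input file name is hypothetical):

python3 sumter-address-convert.py AddressPoints.shp addresses.geojson --keep-original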

View File

@@ -0,0 +1,540 @@
#!/usr/bin/env python3
"""
GeoJSON Multi Modal Golf Cart Path Comparison Script
Compares two GeoJSON files containing road data and identifies:
1. Roads in file1 that don't have corresponding coverage in file2 (removed roads)
2. Roads in file2 that don't have corresponding coverage in file1 (added roads)
Only reports differences that are significant (above minimum length threshold).
Optimized for performance with parallel processing and spatial indexing.
TODO:
- put properties properly on removed roads, so they're visible in JOSM
- handle polygons properly (on previous geojson step?) for circular roads
"""
import json
import argparse
from pathlib import Path
from typing import List, Dict, Any, Tuple
import geopandas as gpd
from shapely.geometry import LineString, MultiLineString, Point, Polygon
from shapely.ops import unary_union
from shapely.strtree import STRtree
import pandas as pd
import warnings
import multiprocessing as mp
from functools import partial
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
import gc
# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
class RoadComparator:
def __init__(self, tolerance_feet: float = 50.0, min_gap_length_feet: float = 100.0,
n_jobs: int = None, chunk_size: int = 1000):
"""
Initialize the road comparator.
Args:
tolerance_feet: Distance tolerance for considering roads as overlapping (default: 50 feet)
min_gap_length_feet: Minimum length of gap/extra to be considered significant (default: 100 feet)
n_jobs: Number of parallel processes to use (default: CPU count - 1)
chunk_size: Number of geometries to process per chunk (default: 1000)
"""
self.tolerance_feet = tolerance_feet
self.min_gap_length_feet = min_gap_length_feet
self.n_jobs = n_jobs or max(1, mp.cpu_count() - 1)
self.chunk_size = chunk_size
# Convert feet to degrees (approximate conversion for continental US)
# 1 degree latitude ≈ 364,000 feet
# 1 degree longitude ≈ 288,000 feet (at 40° latitude)
self.tolerance_deg = tolerance_feet / 364000.0
self.min_gap_length_deg = min_gap_length_feet / 364000.0
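        # e.g. with the defaults: 50 ft tolerance / 364000 ≈ 0.000137 degrees,
        # and the 100 ft minimum gap ≈ 0.000275 degrees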
print(f"Using {self.n_jobs} parallel processes with chunk size {self.chunk_size}")
def load_geojson(self, filepath: str) -> gpd.GeoDataFrame:
"""Load and validate GeoJSON file with optimizations."""
try:
            # Use the pyogrio engine for faster loading of large files
gdf = gpd.read_file(filepath, engine='pyogrio')
# Filter only LineString, MultiLineString, and Polygon geometries
line_types = ['LineString', 'MultiLineString', 'Polygon']
gdf = gdf[gdf.geometry.type.isin(line_types)].copy()
if len(gdf) == 0:
raise ValueError(f"No line geometries found in {filepath}")
# Reset index for efficient processing
gdf = gdf.reset_index(drop=True)
# Ensure geometry is valid and fix simple issues
invalid_mask = ~gdf.geometry.is_valid
if invalid_mask.any():
print(f"Fixing {invalid_mask.sum()} invalid geometries...")
gdf.loc[invalid_mask, 'geometry'] = gdf.loc[invalid_mask, 'geometry'].buffer(0)
print(f"Loaded {len(gdf)} road features from {filepath}")
return gdf
except Exception as e:
raise Exception(f"Error loading {filepath}: {str(e)}")
def create_buffered_union_optimized(self, gdf: gpd.GeoDataFrame) -> Any:
"""Create a buffered union using chunked processing for memory efficiency."""
print("Creating optimized buffered union...")
# Process in chunks to manage memory
chunks = [gdf.iloc[i:i+self.chunk_size] for i in range(0, len(gdf), self.chunk_size)]
chunk_unions = []
# Use partial function for multiprocessing
buffer_func = partial(self._buffer_chunk, tolerance=self.tolerance_deg)
with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
# Submit all chunk processing jobs
future_to_chunk = {executor.submit(buffer_func, chunk): i
for i, chunk in enumerate(chunks)}
# Collect results as they complete
for future in as_completed(future_to_chunk):
chunk_idx = future_to_chunk[future]
try:
chunk_union = future.result()
if chunk_union and not chunk_union.is_empty:
chunk_unions.append(chunk_union)
print(f"Processed chunk {chunk_idx + 1}/{len(chunks)}")
except Exception as e:
print(f"Error processing chunk {chunk_idx}: {str(e)}")
# Union all chunk results
print("Combining chunk unions...")
if chunk_unions:
final_union = unary_union(chunk_unions)
# Force garbage collection
del chunk_unions
gc.collect()
return final_union
else:
raise Exception("No valid geometries to create union")
@staticmethod
def _buffer_chunk(chunk_gdf: gpd.GeoDataFrame, tolerance: float) -> Any:
"""Buffer geometries in a chunk and return their union."""
try:
# Buffer all geometries in the chunk
buffered = chunk_gdf.geometry.buffer(tolerance)
# Create union of buffered geometries
if len(buffered) == 1:
return buffered.iloc[0]
else:
return unary_union(buffered.tolist())
except Exception as e:
print(f"Error in chunk processing: {str(e)}")
return None
def create_spatial_index(self, gdf: gpd.GeoDataFrame) -> STRtree:
"""Create spatial index for fast intersection queries."""
print("Creating spatial index...")
# Create STRtree for fast spatial queries
geometries = gdf.geometry.tolist()
return STRtree(geometries)
def find_removed_segments_optimized(self, source_gdf: gpd.GeoDataFrame,
target_union: Any) -> List[Dict[str, Any]]:
"""
Find segments in source_gdf that are not covered by target_union (removed roads).
Optimized with parallel processing.
"""
print("Finding removed segments...")
# Split into chunks for parallel processing
chunks = [source_gdf.iloc[i:i+self.chunk_size]
for i in range(0, len(source_gdf), self.chunk_size)]
all_removed = []
# Use partial function for multiprocessing
process_func = partial(self._process_removed_chunk,
target_union=target_union,
min_length_deg=self.min_gap_length_deg)
with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
# Submit all chunk processing jobs
future_to_chunk = {executor.submit(process_func, chunk): i
for i, chunk in enumerate(chunks)}
# Collect results as they complete
for future in as_completed(future_to_chunk):
chunk_idx = future_to_chunk[future]
try:
chunk_removed = future.result()
all_removed.extend(chunk_removed)
print(f"Processed removed chunk {chunk_idx + 1}/{len(chunks)}")
except Exception as e:
print(f"Error processing removed chunk {chunk_idx}: {str(e)}")
return all_removed
@staticmethod
def _process_removed_chunk(chunk_gdf: gpd.GeoDataFrame, target_union: Any,
min_length_deg: float) -> List[Dict[str, Any]]:
"""Process a chunk of geometries to find removed segments."""
removed_segments = []
for idx, row in chunk_gdf.iterrows():
geom = row.geometry
# Handle MultiLineString by processing each component
if isinstance(geom, MultiLineString):
lines = list(geom.geoms)
else:
                lines = [geom]  # LineString and Polygon geometries are handled directly
for line in lines:
try:
# Find parts of the line that don't intersect with target_union
uncovered = line.difference(target_union)
if uncovered.is_empty:
continue
# Handle different geometry types returned by difference
uncovered_lines = []
if hasattr(uncovered, 'geoms'):
for geom_part in uncovered.geoms:
if isinstance(geom_part, LineString):
uncovered_lines.append(geom_part)
elif isinstance(uncovered, LineString):
uncovered_lines.append(uncovered)
# Check each uncovered line segment
for uncovered_line in uncovered_lines:
if uncovered_line.length >= min_length_deg:
# Create properties dict with original metadata plus 'removed: true'
properties = dict(row.drop('geometry'))
properties['removed'] = True
removed_segments.append({
'geometry': uncovered_line,
**properties
})
except Exception as e:
continue # Skip problematic geometries
return removed_segments
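    # The core test above, in isolation (standalone shapely sketch, not project code):
    #   from shapely.geometry import LineString
    #   road = LineString([(0, 0), (0.001, 0)])
    #   covered = road.buffer(0.000137)      # stand-in for the other file's union
    #   road.difference(covered).is_empty    # -> True: fully covered, nothing removed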
def find_added_roads_optimized(self, source_gdf: gpd.GeoDataFrame,
target_union: Any) -> List[Dict[str, Any]]:
"""
Find entire roads in source_gdf that don't significantly overlap with target_union.
Optimized with parallel processing.
"""
print("Finding added roads...")
# Split into chunks for parallel processing
chunks = [source_gdf.iloc[i:i+self.chunk_size]
for i in range(0, len(source_gdf), self.chunk_size)]
all_added = []
# Use partial function for multiprocessing
process_func = partial(self._process_added_chunk,
target_union=target_union,
min_length_deg=self.min_gap_length_deg)
with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
# Submit all chunk processing jobs
future_to_chunk = {executor.submit(process_func, chunk): i
for i, chunk in enumerate(chunks)}
# Collect results as they complete
for future in as_completed(future_to_chunk):
chunk_idx = future_to_chunk[future]
try:
chunk_added = future.result()
all_added.extend(chunk_added)
print(f"Processed added chunk {chunk_idx + 1}/{len(chunks)}")
except Exception as e:
print(f"Error processing added chunk {chunk_idx}: {str(e)}")
return all_added
@staticmethod
def _process_added_chunk(chunk_gdf: gpd.GeoDataFrame, target_union: Any,
min_length_deg: float) -> List[Dict[str, Any]]:
"""Process a chunk of geometries to find added roads."""
added_roads = []
for idx, row in chunk_gdf.iterrows():
geom = row.geometry
try:
# Check what portion of the road is not covered
uncovered = geom.difference(target_union)
if not uncovered.is_empty:
# Calculate what percentage of the original road is uncovered
uncovered_length = 0
if hasattr(uncovered, 'geoms'):
for geom_part in uncovered.geoms:
if isinstance(geom_part, LineString):
uncovered_length += geom_part.length
elif isinstance(uncovered, LineString):
uncovered_length = uncovered.length
original_length = geom.length
uncovered_ratio = uncovered_length / original_length if original_length > 0 else 0
                    # Include the entire road if more than 10% of it is uncovered
                    # (the minimum-length check below is currently disabled):
                    # uncovered_length >= min_length_deg and
                    if uncovered_ratio > 0.1:
# Include entire original road with all original metadata
original_properties = dict(row.drop('geometry'))
#
# For Sumter County Roads
#
properties = {
'surface': 'asphalt'
}
                        # Skip cart paths flagged as part of a road; per the README,
                        # golf-cart tags go on the street itself instead.
                        output = original_properties.get('Part_of_Ro') != "Yes"
                        if output:
                            properties.update({
                                'bicycle': 'yes',
                                'foot': 'yes',
                                'golf': 'cartpath',
                                'golf_cart': 'yes',
                                'highway': 'path',
                                'motor_vehicle': 'no',
                                'segregated': 'no',
                                'surface': 'asphalt',
                            })
if output:
added_roads.append({
'geometry': geom,
**properties
})
except Exception as e:
print(e)
continue # Skip problematic geometries
return added_roads
def compare_roads(self, file1_path: str, file2_path: str) -> Tuple[List[Dict], List[Dict]]:
"""
Compare two GeoJSON files and find significant differences.
Optimized version with parallel processing.
Returns:
Tuple of (removed_roads, added_roads)
"""
print(f"Comparing {file1_path} and {file2_path}")
print(f"Tolerance: {self.tolerance_feet} feet")
print(f"Minimum significant length: {self.min_gap_length_feet} feet")
print(f"Parallel processing: {self.n_jobs} workers")
print("-" * 50)
# Load both files
gdf1 = self.load_geojson(file1_path)
gdf2 = self.load_geojson(file2_path)
# Ensure both are in the same CRS
if gdf1.crs != gdf2.crs:
print(f"Warning: CRS mismatch. Converting {file2_path} to match {file1_path}")
gdf2 = gdf2.to_crs(gdf1.crs)
print("Creating optimized spatial unions...")
# Create buffered unions using optimized method
union1 = self.create_buffered_union_optimized(gdf1)
union2 = self.create_buffered_union_optimized(gdf2)
print("Finding removed and added roads with parallel processing...")
# Find roads using optimized parallel methods
removed_roads = self.find_removed_segments_optimized(gdf1, union2)
added_roads = self.find_added_roads_optimized(gdf2, union1)
# Clean up memory
del gdf1, gdf2, union1, union2
gc.collect()
return removed_roads, added_roads
def save_results(self, removed: List[Dict], added: List[Dict], output_path: str):
"""Save results to GeoJSON file."""
all_results = removed + added
if not all_results:
print("No significant differences found!")
return
# Create GeoDataFrame efficiently
print("Saving results...")
results_gdf = gpd.GeoDataFrame(all_results)
# Save to file with optimization
results_gdf.to_file(output_path, driver='GeoJSON', engine='pyogrio')
print(f"Results saved to: {output_path}")
def print_summary(self, removed: List[Dict], added: List[Dict], file1_name: str, file2_name: str):
"""Print a summary of the comparison results."""
print("\n" + "="*60)
print("COMPARISON SUMMARY")
print("="*60)
print(f"\nFile 1: {file1_name}")
print(f"File 2: {file2_name}")
print(f"Tolerance: {self.tolerance_feet} feet")
print(f"Minimum significant length: {self.min_gap_length_feet} feet")
if removed:
print(f"\n🔴 REMOVED ROADS ({len(removed)} segments):")
print("These road segments exist in File 1 but are missing or incomplete in File 2:")
# Calculate total length of removed segments
total_removed_length = 0
removed_by_road = {}
for segment in removed:
geom = segment['geometry']
length_feet = geom.length * 364000.0 # Convert to feet
total_removed_length += length_feet
# Get road name
road_name = "Unknown"
name_fields = ['name', 'NAME', 'road_name', 'street_name', 'FULLNAME']
for field in name_fields:
if field in segment and pd.notna(segment[field]):
road_name = str(segment[field])
break
if road_name not in removed_by_road:
removed_by_road[road_name] = []
removed_by_road[road_name].append(length_feet)
print(f"Total removed length: {total_removed_length:,.1f} feet ({total_removed_length/5280:.2f} miles)")
for road, lengths in sorted(removed_by_road.items()):
road_total = sum(lengths)
print(f"{road}: {len(lengths)} segment(s), {road_total:,.1f} feet")
if added:
print(f"\n🔵 ADDED ROADS ({len(added)} roads):")
print("These roads exist in File 2 but are missing or incomplete in File 1:")
# Calculate total length of added roads
total_added_length = 0
added_by_road = {}
for road in added:
geom = road['geometry']
length_feet = geom.length * 364000.0 # Convert to feet
total_added_length += length_feet
# Get road name
road_name = "Unknown"
name_fields = ['name', 'NAME', 'road_name', 'street_name', 'FULLNAME']
for field in name_fields:
if field in road and pd.notna(road[field]):
road_name = str(road[field])
break
if road_name not in added_by_road:
added_by_road[road_name] = 0
added_by_road[road_name] += length_feet
print(f"Total added length: {total_added_length:,.1f} feet ({total_added_length/5280:.2f} miles)")
for road, length in sorted(added_by_road.items()):
print(f"{road}: {length:,.1f} feet")
if not removed and not added:
print("\n✅ No significant differences found!")
print("The road networks have good coverage overlap within the specified tolerance.")
def main():
parser = argparse.ArgumentParser(
description="Compare two GeoJSON files containing roads and find significant gaps or extras (Optimized)",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python sumter-multi-modal-convert.py osm-multi-modal.geojson county-multi-modal.geojson
python sumter-multi-modal-convert.py osm-multi-modal.geojson county-multi-modal.geojson --tolerance 100 --min-length 200
python sumter-multi-modal-convert.py osm-multi-modal.geojson county-multi-modal.geojson --output differences.geojson
python sumter-multi-modal-convert.py osm-multi-modal.geojson county-multi-modal.geojson --jobs 8 --chunk-size 2000
"""
)
parser.add_argument('file1', help='First GeoJSON file')
parser.add_argument('file2', help='Second GeoJSON file')
parser.add_argument('--tolerance', '-t', type=float, default=50.0,
help='Distance tolerance in feet for considering roads as overlapping (default: 50)')
parser.add_argument('--min-length', '-m', type=float, default=100.0,
help='Minimum length in feet for gaps/extras to be considered significant (default: 100)')
parser.add_argument('--output', '-o', help='Output GeoJSON file for results (optional)')
parser.add_argument('--jobs', '-j', type=int, default=None,
help='Number of parallel processes (default: CPU count - 1)')
parser.add_argument('--chunk-size', '-c', type=int, default=1000,
help='Number of geometries to process per chunk (default: 1000)')
args = parser.parse_args()
# Validate input files
if not Path(args.file1).exists():
print(f"Error: File {args.file1} does not exist")
return 1
if not Path(args.file2).exists():
print(f"Error: File {args.file2} does not exist")
return 1
try:
# Create comparator and run comparison
comparator = RoadComparator(
tolerance_feet=args.tolerance,
min_gap_length_feet=args.min_length,
n_jobs=args.jobs,
chunk_size=args.chunk_size
)
removed, added = comparator.compare_roads(args.file1, args.file2)
# Print summary
comparator.print_summary(removed, added, args.file1, args.file2)
# Save results if output file specified
if args.output:
comparator.save_results(removed, added, args.output)
elif removed or added:
# Auto-generate output filename if differences found
output_file = f"multi_modal_differences_{Path(args.file1).stem}_vs_{Path(args.file2).stem}.geojson"
comparator.save_results(removed, added, output_file)
return 0
except Exception as e:
print(f"Error: {str(e)}")
return 1
if __name__ == "__main__":
exit(main())

View File

@@ -14,6 +14,13 @@ TODO:
- handle polygons properly (on previous geojson step?) for circular roads
- ignore roads that aren't LIFECYCLE ACTV or Active
- include OneWay=Y
- handle C 44a -> County Road 44A
- handle Tpke -> Turnpike
- handle Trce -> Trace/Terrace?
- handle Cor -> Corner
- handle Obrien -> O'Brien
- handle Oday -> O'Day
- handle Ohara -> O'Hara
"""
import json