Parse state and city better

This commit is contained in:
Will Bradley 2021-02-25 01:30:50 -08:00
parent 7624a68152
commit 5bdc629114
2 changed files with 36 additions and 12 deletions

View File

@ -119,9 +119,8 @@ Please ensure you are logged in under a dedicated import account with a user nam
### Internal Notes
- TODO:
- TAZ CIDs aren't working, each CID is a full 200mb export
- Address conflation isn't working, no buildings have numbers
- Consider simplifying geometry, ~50 buildings are drawn twice
- Spot check buildings without addresses
- Double check if geometry is adequately simplified or not
- http://download.geofabrik.de/north-america/us/california/norcal-latest.osm.pbf

View File

@ -15,7 +15,7 @@ ALTER TABLE sonoma_county_building_outlines
ADD COLUMN IF NOT EXISTS conflated boolean DEFAULT FALSE,
ADD COLUMN IF NOT EXISTS main boolean; -- is it the main building on the parcel?
update sonoma_county_building_outlines set "addr:housenumber" = NULL, "addr:street" = NULL, "addr:unit" = NULL, "addr:city" = NULL, "addr:state" = NULL, usecode = NULL, cid = NULL;
update sonoma_county_building_outlines set "addr:housenumber" = NULL, "addr:street" = NULL, "addr:unit" = NULL, "addr:city" = NULL, "addr:state" = NULL, usecode = NULL, cid = NULL, conflated = FALSE, main = FALSE;
-- create local geometry fields and validate geometries
UPDATE sonoma_county_building_outlines SET loc_geom = ST_MakeValid(geom);
@ -28,12 +28,11 @@ ALTER TABLE parcels__public_
ADD COLUMN IF NOT EXISTS "addr:unit" text,
ADD COLUMN IF NOT EXISTS "addr:city" text,
ADD COLUMN IF NOT EXISTS "addr:state" text,
ADD COLUMN IF NOT EXISTS usecode integer,
ADD COLUMN IF NOT EXISTS loc_geom geometry(multipolygon,4326), -- local is the same in this case, except made valid
ADD COLUMN IF NOT EXISTS building_count integer,
ADD COLUMN IF NOT EXISTS repeating BOOLEAN DEFAULT FALSE;
update parcels__public_ set "addr:housenumber" = NULL, "addr:street" = NULL, "addr:unit" = NULL, "addr:city" = NULL, "addr:state" = NULL;
update parcels__public_ set "addr:housenumber" = NULL, "addr:street" = NULL, "addr:unit" = NULL, "addr:city" = NULL, "addr:state" = NULL, building_count = NULL, repeating = FALSE;
-- create local geometry fields and validate geometries
UPDATE parcels__public_ SET loc_geom = ST_MakeValid(geom);
@ -188,13 +187,39 @@ $$ LANGUAGE plpgsql;
-- reset our fields
update parcels__public_ SET "addr:housenumber" = NULL,
"addr:street" = NULL,
"addr:unit" = NULL
where "addr:housenumber" IS NOT NULL;
"addr:unit" = NULL,
"addr:city" = NULL,
"addr:state" = NULL;
-- parse city and state
update parcels__public_ SET "addr:city" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z]+)\* ([A-Za-z]+)$', '\1')),
"addr:state" = REGEXP_REPLACE(situsfmt2,'^([A-Za-z]+)\* ([A-Za-z]+)$', '\2')
where situsfmt2 SIMILAR TO '([A-Za-z]+)\* ([A-Za-z]+)' AND "addr:city" IS NULL;
-- with spaces
update parcels__public_ SET "addr:city" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z ]+)\* ([A-Za-z]+)$', '\1')),
"addr:state" = REGEXP_REPLACE(situsfmt2,'^([A-Za-z ]+)\* ([A-Za-z]+)$', '\2')
where situsfmt2 SIMILAR TO '([A-Za-z ]+)\* ([A-Za-z]+)' AND "addr:city" IS NULL;
-- with spaces and zip
update parcels__public_ SET "addr:city" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z ]+)\* ([A-Za-z]+) [0-9]+$', '\1')),
"addr:state" = REGEXP_REPLACE(situsfmt2,'^([A-Za-z ]+)\* ([A-Za-z]+) [0-9]+$', '\2')
where situsfmt2 SIMILAR TO '([A-Za-z ]+)\* ([A-Za-z]+) [0-9]+' AND "addr:city" IS NULL;
-- with no asterisk, and "Ca" suffix
update parcels__public_ SET "addr:city" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z ]+) CA$', '\1')),
"addr:state" = 'CA'
where situsfmt2 SIMILAR TO '([A-Za-z ]+) CA' AND "addr:city" IS NULL;
-- with no asterisk, "CA" state, and zip
update parcels__public_ SET "addr:city" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z ]+) CA [0-9]+$', '\1')),
"addr:state" = 'CA'
where situsfmt2 SIMILAR TO '([A-Za-z ]+) CA [0-9]+' AND "addr:city" IS NULL;
-- some "cities" we don't want, and state needs to be expanded
update parcels__public_ SET "addr:city" = NULL where "addr:city" = 'Unincorp County';
update parcels__public_ SET "addr:city" = NULL where "addr:city" = 'Unknown';
update parcels__public_ SET "addr:city" = NULL where "addr:city" = 'Area Bodega Bay';
update parcels__public_ SET "addr:city" = NULL where "addr:city" = 'Area Duncans Mills';
update parcels__public_ SET "addr:state" = 'California' where "addr:state" = 'CA';
-- parse city sratr
update parcels__public_ SET "addr:city" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z]+)\*? ([A-Za-z]+)$', '\1')),
"addr:state" = initcap(REGEXP_REPLACE(situsfmt2,'^([A-Za-z]+)\*? ([A-Za-z]+)$', '\2'))
where situsfmt2 SIMILAR TO '([A-Za-z]+)\*? ([A-Za-z]+)' IS NOT NULL;
-- basic 123 Main with no common suffixes or numbers
update parcels__public_ SET "addr:housenumber" = initcap(REGEXP_REPLACE(situsfmt1, '^([0-9]+) ([A-Z]+)$', '\1')),