After the review of the project https://review.udacity.com/#!/reviews/293667 I've created a special notebook
for preprocessing the field "addr:postcode" in the .osm file. First, we should find the range of its values.
# Import python libraries
import re
import numpy as np
import scipy
import json
import codecs
import matplotlib.pyplot as plt
import xml.etree.cElementTree as ET
%matplotlib inline
# Function for counting postcodes and getting their distinct values
def zip_codes(filename):
    """Scan an OSM XML file and summarize its 'addr:postcode' tags.

    Returns a tuple (count, data):
      count -- number of 'addr:postcode' tags found on node/way elements
      data  -- set of the distinct postcode strings
    """
    count = 0
    data = set()
    # Iterate on 'end' events so each element is fully built (all of its
    # child <tag> elements attached) before we inspect it; with 'start'
    # events the children may not have been parsed yet, which can
    # silently undercount.
    for event, elem in ET.iterparse(filename, events=("end",)):
        if elem.tag in ("node", "way"):
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "addr:postcode":
                    count += 1
                    data.add(tag.attrib['v'])
            # Release the processed element to keep memory flat on large files.
            elem.clear()
    return count, data
The next step is to set up the working directory and files for preprocessing:
# Working directory that holds the raw and converted OpenStreetMap files.
FILEDIR = "/Users/olgabelitskaya/large-repo/"
# Raw OSM extracts: a small sample plus the full and alternative dumps.
SAMPLE_FILE = "{0}sample_dubai_abu-dhabi.osm".format(FILEDIR)
FILE = "{0}dubai_abu-dhabi.osm".format(FILEDIR)
FILE0 = "{0}dubai_abu-dhabi0.osm".format(FILEDIR)
# Line-delimited JSON versions produced by the conversion step.
JSON_FILE = "{0}dubai_abu-dhabi.osm.json".format(FILEDIR)
JSON_FILE0 = "{0}dubai_abu-dhabi0.osm.json".format(FILEDIR)
Applying the function 'zip_codes':
# Count 'addr:postcode' tags in the full extract; z = (count, set_of_values).
z = zip_codes(FILE)
The number of the unique values:
# Number of distinct postcode values found.
len(z[1])
The number of nodes/ways with postcodes:
# Total number of 'addr:postcode' tags counted.
z[0]
Discovering problems in the data:
znp = np.array(sorted(z[1]))
print "All postcodes:", znp
expected = np.append(znp[3:65], znp[66:84])
print "Expected:", expected
unexpected0 = np.append(znp[:3], znp[84:])
unexpected = np.insert(unexpected0, 3, znp[65])
print "Unexpected:", unexpected
Mapping the right values:
# Manual mapping from malformed postcode strings to cleaned values.
# 'NA' marks entries with no recoverable postcode.
correction = {
    '0': 'NA',
    '0000': 'NA',
    '000000': 'NA',
    '5280 dubai': '5280',
    'Muhaisnah 4': 'NA',
    'P O BOX 3766': '3766',
    'P. O. Box 123234': '123234',
    'P. O. Box 31166': '31166',
    'P.O. Box 4605': '4605',
    'P.O. Box 5618, Abu Dhabi, U.A.E': '5618',
    'P.O. Box 6446': '6446',
    'P.O. Box 9770': '9770',
    'PO Box 114822': '114822',
    'PO Box 118737': '118737',
    'PO Box 43377': '43377',
    'PO Box 6770': '6770',
}
# Display the mapping (notebook cell output).
correction
Creating the function for updating:
# Function for updating values:
def update_name(name, correction):
    """Return the corrected postcode for *name* using the mapping.

    Raises Exception(name) when the name has no entry in *correction*
    or when its mapped replacement is empty.
    """
    if name not in correction:
        raise Exception(name)
    replacement = correction[name]
    # Guard against empty mapping values.
    if not replacement:
        raise Exception(name)
    # The original used re.sub(name, replacement, name); since the whole
    # name is the pattern, that is just the replacement itself — and the
    # unescaped keys contain regex metacharacters ('.', etc.), so a direct
    # lookup is both equivalent and safe.
    return replacement
# Check the function
# Expected cell output: '43377'
update_name('PO Box 43377', correction)
Let's create the list of dictionaries from the .json file:
# Load the converted documents: the JSON file holds one object per line.
DICT = []
# Read-only access suffices ('r+' granted needless write access), and the
# 'with' block guarantees the handle is closed.
with open(JSON_FILE, 'r') as json_file:
    for line in json_file:
        DICT.append(json.loads(line))
# Number of loaded documents.
len(DICT)
# Spot-check one document.
DICT[1200]
Now we can apply the function for updating and check the results.
# Apply the function 'update_name'
# Fix every unexpected postcode in place (mutates the dicts inside DICT).
for document in DICT:
    address = document.get('address')
    if address is None:
        continue
    postcode = address.get('postcode')
    # 'unexpected' is the numpy array of malformed values built earlier.
    if postcode is not None and postcode in unexpected:
        address['postcode'] = update_name(postcode, correction)
# Check the correction of the postcodes
postcodes = []
for element in DICT:
address = element.get('address')
if address != None:
postcode = address.get('postcode')
if postcode != None:
postcodes.append(postcode)
print postcodes
Finally, let's create a new .json file, insert into the MongoDB collection and compare results.
# Create new file
# Serialize the cleaned documents back to disk, one JSON object per line.
output_path = FILEDIR + "dubai_abu-dhabi_postcode.osm.json"
with open(output_path, 'w') as out:
    for document in DICT:
        out.write(json.dumps(document))
        out.write('\n')
# Load the R bridge, then use R's mongolite to stream the cleaned
# line-delimited JSON file into the 'openstreetmap_correct' collection.
%load_ext rpy2.ipython
%R m <- mongo("openstreetmap_correct", verbose = FALSE)
%R stream_in(file("/Users/olgabelitskaya/large-repo/dubai_abu-dhabi_postcode.osm.json"),
handler = function(df){m$insert(df)})
# Open databases before correction and after
from pymongo import MongoClient
# Connect to the local MongoDB instance.
client = MongoClient('localhost:27017')
database = client['test']
# Collections holding the documents before and after postcode correction.
dubai_abu_dhabi = database['openstreetmap']
dubai_abu_dhabi_correct = database['openstreetmap_correct']
# Create a list of zipcodes without correction
# Group documents by postcode and count occurrences, most frequent first.
zipcodes = dubai_abu_dhabi.aggregate( [
{ "$match" : { "address.postcode" : { "$exists" : 1} } },
{ "$group" : { "_id" : "$address.postcode", "count" : { "$sum" : 1} } },
{ "$sort" : { "count" : -1}}
] )
# Materialize the cursor for display.
list(zipcodes)
# Create a list of zipcodes with correction
# Same pipeline as above, run on the corrected collection for comparison.
correct_zipcodes = dubai_abu_dhabi_correct.aggregate( [
{ "$match" : { "address.postcode" : { "$exists" : 1} } },
{ "$group" : { "_id" : "$address.postcode", "count" : { "$sum" : 1} } },
{ "$sort" : { "count" : -1}}
] )
# Materialize the cursor for display.
list(correct_zipcodes)
Correction is successful.
An alternative way to update the zip codes is to modify the script that converts the OSM format into JSON:
z0 = zip_codes(FILE0)
z0[0]
znp0 = np.array(sorted(z0[1]))
print "All postcodes:", znp0
expected0 = np.append(znp0[3:67], znp0[68:87])
print "Expected:", expected0
unexpected00 = np.append(znp0[:3], znp0[87:])
unexpected0 = np.insert(unexpected00, 3, znp0[67])
print "Unexpected:", unexpected0
'P.O. Box 9770' in correction.keys()
correction['P.O. Box 9770']
# osm_json_correct.py
# Strings with chars that will cause problems as keys
# Matches any single character that is unsafe in a MongoDB field name
# (punctuation, quotes, and whitespace).
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Function for creating nodes
def shape_element(element):
# Create the empty dictionary for the data in the osm string
node = {}
if element.tag == "node" or element.tag == "way":
# Create the empty dictionary for the 'address' attributes and the list for the 'nd' attribute
address = {}
nd = []
# Add the type and the id of the element
node["type"] = element.tag
node["id"] = element.attrib["id"]
# Add the tag 'visible'
if "visible" in element.attrib.keys():
node["visible"] = element.attrib["visible"]
# Add the geoposition
if "lat" in element.attrib.keys():
node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
# Add the set of the attributes
node["created"] = {"version": element.attrib['version'],
"changeset": element.attrib['changeset'],
"timestamp": element.attrib['timestamp'],
"uid": element.attrib['uid'],
"user": element.attrib['user']}
# Analize the problemchars and add address attributes
for tag in element.iter("tag"):
p = problemchars.search(tag.attrib['k'])
if p:
print "problemchars: ", p.group()
continue
elif tag.attrib['k'][:5] == "addr:":
if ":" in tag.attrib['k'][5:]:
continue
else:
# Correction the postcodes
if tag.attrib['k'] == "addr:postcode":
if tag.attrib['v'] in correction.keys():
address[tag.attrib['k'][5:]] = update_name(tag.attrib['v'],
correction)
else:
address[tag.attrib['k'][5:]] = tag.attrib['v']
else:
address[tag.attrib['k'][5:]] = tag.attrib['v']
else:
node[tag.attrib['k']] = tag.attrib['v']
if address != {}:
node['address'] = address
# Add the 'node_ref' attribute
for tag2 in element.iter("nd"):
nd.append(tag2.attrib['ref'])
if nd != []:
node['node_refs'] = nd
return node
# Skip elements without the tags 'node' or 'way'
else:
return None
# Function for creating the .json file
def process_map(file_in, pretty = False):
    """Convert an OSM file into line-delimited JSON.

    Writes '<file_in>.json' next to the input and returns the list of
    shaped documents. When *pretty* is true, the JSON is indented.
    """
    # Output path mirrors the input name with a .json suffix.
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as out:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            # Elements other than node/way come back as None — skip them.
            if not shaped:
                continue
            data.append(shaped)
            if pretty:
                out.write(json.dumps(shaped, indent=2)+"\n")
            else:
                out.write(json.dumps(shaped) + "\n")
    return data
# Create a json file
DATA0 = process_map(FILE0)
# Check the correction of the postcodes
postcodes0 = []
for element in DATA0:
address = element.get('address')
if address != None:
postcode = address.get('postcode')
if postcode != None:
postcodes0.append(postcode)
print postcodes0
Correction is successful as well.