from IPython.core.display import display, HTML
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import folium
import geopandas as gpd
# from mpl_toolkits.basemap import Basemap
import xml.etree.cElementTree as ET
from collections import defaultdict as dfdict
import numpy as np
import pandas as pd
import pprint
import urllib
import re
import os
import csv
import cerberus
import json
import codecs
from schema import Schema
import sqlite3
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
from odo import odo, discover, resource
# Sentinel string: any notebook cell whose source contains `hide_code`
# is hidden by the JS toggle injected below.
hide_code = ''
# Inject a JavaScript show/hide toggle for code cells so the notebook
# renders as a report; the form button flips visibility of all inputs.
HTML('''<script>
code_show = true;
function code_display() {
if (code_show) {
$('div.input').each(function(id) {
if (id == 0 || $(this).html().indexOf('hide_code') > -1) {
$(this).hide();
}
});
$('div.output_prompt').css('opacity', 0);
} else {
$('div.input').each(function(id) {
$(this).show();
});
$('div.output_prompt').css('opacity', 1);
}
code_show = !code_show
}
$( document ).ready(code_display);
</script>
<form action="javascript: code_display()"><input style="opacity: 100" type="submit"
value="Click to show or to hide code cells"></form>''')
Code snippets from the courses "Intro to Relational Databases" and "SQL for Data Analysis" (udacity.com) have been used here.
hide_code
# Function for counting tags
def count_tags(filename):
    """Return a mapping of XML tag name -> number of occurrences in the file."""
    tag_counts = dfdict(int)
    # iterparse yields (event, element) pairs; default events are ('end',),
    # so every element is counted exactly once.
    for _, elem in ET.iterparse(filename):
        tag_counts[elem.tag] += 1
    return tag_counts
hide_code
# Functions for counting users
def get_user(element):
    """Return the 'user' attribute of an OSM element, or None if absent.

    Fix: the original was an unfinished stub that always returned None.
    """
    return element.attrib.get('user')
def process_map_users(filename):
    """Return the set of distinct 'user' attribute values found on
    node/way/relation elements of the file."""
    contributors = set()
    for _, elem in ET.iterparse(filename):
        if elem.tag in ('node', 'way', 'relation'):
            contributors.add(elem.attrib['user'])
    return contributors
hide_code
# Strings containing only lower case chars (or '_')
lower = re.compile(r'^([a-z]|_)*$')
# Strings of lower case chars (or '_') with a single ':' separator
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Strings with chars that will cause problems as keys.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify a <tag> element's 'k' attribute into one of the four
    buckets of *keys* ('lower', 'lower_colon', 'problemchars', 'other')
    and return the updated counter dict.

    Fix: comparisons with None now use `is not None` (PEP 8) instead of
    `!= None`, and the attribute lookup is hoisted out of the chain.
    """
    if element.tag == "tag":
        k = element.attrib['k']
        if lower.search(k) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k) is not None:
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys


def process_map_keys(filename):
    """Count tag keys by type over the whole .osm file."""
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys
hide_code
# Function for counting street addresses
def street_number(file_name):
    """Count <tag k="addr:street"> children of node/way elements."""
    total = 0
    for _, elem in ET.iterparse(file_name, events=("start",)):
        if elem.tag in ('node', 'way'):
            total += sum(1 for t in elem.iter('tag')
                         if t.attrib['k'] == "addr:street")
    return total
hide_code
# Function for counting zip codes
def zip_codes(filename):
    """Return (occurrence count, set of distinct values) for
    'addr:postcode' tags under node/way elements."""
    total = 0
    values = set()
    for _, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in ('node', 'way'):
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "addr:postcode":
                    total += 1
                    values.add(tag.attrib['v'])
    return total, values
hide_code
# Functions for auditing zip codes.
# Postcode whitelist (empty: every value gets tallied).
# NOTE(review): `expected` is reassigned to a list of street types further
# down in this file; if the postcode audit runs after that cell, postcodes
# would be filtered against street names — confirm the intended cell order.
expected = []


def audit_postcode_range(postcode, tag):
    """Increment the tally for the tag's postcode value unless whitelisted."""
    value = tag.attrib["v"]
    if value not in expected:
        # dict.get collapses the original add-or-increment branching
        postcode[value] = postcode.get(value, 0) + 1


def is_postcode(elem):
    """Return True when the element is an 'addr:postcode' tag."""
    return elem.attrib['k'] == "addr:postcode"


def process_map_postcodes(filename):
    """Tally postcode values of node/way elements in the file.

    Fix: the original opened the file without ever closing it; iterparse
    accepts a filename directly and closes the file itself.
    """
    postcode = {}
    for _, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in ("node", "way"):
            for tag in elem.iter("tag"):
                if is_postcode(tag):
                    audit_postcode_range(postcode, tag)
    return postcode
hide_code
# Function for displaying english names
def process_map_names(filename):
    """Return (occurrence count, set of distinct values) for 'name:en'
    tags under node/way elements."""
    total = 0
    english = set()
    for _, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in ('node', 'way'):
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "name:en":
                    total += 1
                    english.add(tag.attrib['v'])
    return total, english
hide_code
# Function for sorting by place
def place_type(element, places):
    """Tally a node's 'place' tag value into the *places* counter dict;
    values outside the known categories are counted as 'other'."""
    known = ('city', 'town', 'village', 'hamlet', 'island')
    if element.tag == "node":
        for tag in element.iter('tag'):
            if tag.attrib['k'] == 'place':
                value = tag.attrib['v']
                # counter keys match the tag values, so no elif chain needed
                places[value if value in known else 'other'] += 1
    return places


def process_map_places(filename):
    """Count places by type across the whole file."""
    places = {"city": 0, "town": 0, "island": 0, "village": 0,
              "hamlet": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        places = place_type(element, places)
    return places
hide_code
# Functions for auditing street names.
# Regex capturing the last whitespace-delimited word of a street name
# (optionally ending in '.'), used as the street "type".
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Street types considered already correct.
# NOTE(review): this rebinds the module-level `expected` that the postcode
# audit above also reads — confirm the intended cell execution order.
expected = ["Avenue", "Boulevard", "Commons", "Court",
"Drive", "Lane", "Parkway", "Place",
"Road", "Square", "Street", "Trail"]
# Mapping of abbreviated / mis-cased street types to their canonical form.
mapping = {'Ave' : 'Avenue',
'Blvd' : 'Boulevard',
'Dr' : 'Drive',
'Ln' : 'Lane',
'Pkwy' : 'Parkway',
'ROAD' : 'Road',
'Rd' : 'Road',
'Rd.' : 'Road',
'road' : 'Road',
'rd' : 'Road',
'STREET' : 'Street',
'St.' : 'Street',
'st.' : 'Street',
'St' : 'Street',
'st' : 'Street',
'street' :"Street",
'Ct' : "Court",
'Cir' : "Circle",
# NOTE(review): 'Cr' commonly abbreviates 'Crescent', not 'Court' — confirm.
'Cr' : "Court",
'ave' : 'Avenue',
'Hwg' : 'Highway',
'Hwy' : 'Highway',
'Sq' : "Square"}
def audit_street_type(street_types, street_name):
    """Record *street_name* under its trailing street-type word in the
    *street_types* dict-of-sets when that type is not in `expected`."""
    match = street_type_re.search(street_name)
    if match is None:
        return
    street_type = match.group()
    if street_type not in expected:
        street_types[street_type].add(street_name)
def is_street_name(elem):
    """Return True when the element is an 'addr:street' tag."""
    return elem.attrib['k'] == "addr:street"
def audit_street_names(filename):
    """Collect unexpected street types mapped to the set of street names
    that use them.

    Fix: the original opened the file without closing it; iterparse takes
    the filename directly and closes the file itself.
    """
    street_types = dfdict(set)
    for _, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in ("node", "way"):
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types
hide_code
# Functions for updating street names
def update_name(name, mapping, regex):
    """Return *name* with its street-type word (matched by *regex*)
    replaced via *mapping*; names with no mapped match pass through."""
    match = regex.search(name)
    if match and match.group() in mapping:
        return re.sub(regex, mapping[match.group()], name)
    return name
hide_code
# Functions for creating the sample file
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each top-level element whose tag is in *tags*, clearing the
    root after every yield to keep memory bounded.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    stream = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(stream)
    for evt, elem in stream:
        if evt != 'end' or elem.tag not in tags:
            continue
        yield elem
        root.clear()
hide_code
# Strings containing lower case chars (these three patterns duplicate the
# ones declared earlier in the notebook; kept because this cell re-declares them)
lower = re.compile(r'^([a-z]|_)*$')
# Strings with lower case chars and a ':'
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Strings with chars that will cause problems as keys
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Attribute names gathered into the nested "created" sub-document
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element1(element):
    """Shape a node/way XML element into a dict for JSON export.

    Returns None for any other element type. 'addr:*' tags (without a
    second colon) are grouped under node['address']; <nd> refs of ways go
    to node['node_refs'].

    Fixes: the Python-2-only print statement is replaced with a form that
    works on both interpreters; the "created" dict is now built from the
    CREATED constant (which the original declared but never used);
    `in element.attrib` replaces `in element.attrib.keys()`.
    """
    if element.tag not in ("node", "way"):
        return None
    node = {"type": element.tag, "id": element.attrib["id"]}
    if "visible" in element.attrib:
        node["visible"] = element.attrib["visible"]
    if "lat" in element.attrib:
        # assumes 'lon' is present whenever 'lat' is — TODO confirm
        node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
    node["created"] = dict((key, element.attrib[key]) for key in CREATED)
    address = {}
    for tag in element.iter("tag"):
        k = tag.attrib['k']
        problems = problemchars.search(k)
        if problems:
            print("problemchars: " + problems.group())
            continue
        if k.startswith("addr:"):
            suffix = k[5:]
            # skip second-level address keys such as 'addr:street:type'
            if ":" not in suffix:
                address[suffix] = tag.attrib['v']
        else:
            node[k] = tag.attrib['v']
    if address:
        node['address'] = address
    refs = [nd.attrib['ref'] for nd in element.iter("nd")]
    if refs:
        node['node_refs'] = refs
    return node
# Function for creating the .json file
def process_map1(file_in, pretty = False):
    """Convert an .osm file to '<file_in>.json' (one JSON document per
    line) and return the list of shaped documents."""
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element1(element)
            if shaped:
                data.append(shaped)
                # indent only when a human-readable file was requested
                dumped = json.dumps(shaped, indent=2) if pretty else json.dumps(shaped)
                fo.write(dumped + "\n")
    return data
hide_code
import schema
# NOTE(review): this binds the Schema *class* from the `schema` package
# (also imported at the top of the file), but the cerberus validation below
# expects a schema *dict*; presumably the project-local schema module's dict
# (conventionally `schema.schema`) was intended — confirm which module named
# `schema` shadows which on this path.
SCHEMA = schema.Schema
hide_code
# Output csv file name for each SQL table.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
# Key with a lower-case (or '_') prefix followed by ':'.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Characters that cause problems when used inside csv keys.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
hide_code
def shape_element(element, node_attr_fields = NODE_FIELDS, way_attr_fields = WAY_FIELDS,
                  problem_chars = PROBLEMCHARS, default_tag_type = 'regular'):
    """Clean and shape a node or way XML element to a Python dict.

    Returns {'node': ..., 'node_tags': [...]} for nodes,
    {'way': ..., 'way_nodes': [...], 'way_tags': [...]} for ways,
    and None for any other element.

    Fixes: the `problem_chars` and `default_tag_type` parameters were
    accepted but ignored (the body hard-coded PROBLEMCHARS and 'regular');
    the tag-shaping logic, previously duplicated verbatim for nodes and
    ways, is factored into one local helper.
    """
    def shape_tag(child):
        """Shape one <tag> child into a row dict, or None when its key
        contains problematic characters."""
        k = child.attrib["k"]
        if problem_chars.search(k):
            return None
        tag = {"id": element.attrib["id"], "value": child.attrib["v"]}
        if LOWER_COLON.search(k):
            # split on the first ':' only: prefix becomes the type,
            # the remainder (which may itself contain ':') the key
            tag_type, _, tag_key = k.partition(':')
            tag["key"] = tag_key
            tag["type"] = tag_type if tag_type else default_tag_type
        else:
            tag["key"] = k
            tag["type"] = default_tag_type
        return tag

    tags = []  # secondary tags, shaped identically for nodes and ways
    if element.tag == 'node':
        node_attribs = dict((f, element.attrib[f]) for f in node_attr_fields)
        for child in element:
            tag = shape_tag(child)
            if tag:
                tags.append(tag)
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        way_attribs = dict((f, element.attrib[f]) for f in way_attr_fields)
        way_nodes = []
        for child in element:
            if child.tag == 'tag':
                tag = shape_tag(child)
                if tag:
                    tags.append(tag)
            elif child.tag == 'nd':
                # 'position' preserves the original ordering of the refs
                way_nodes.append({'id': element.attrib["id"],
                                  'node_id': child.attrib["ref"],
                                  'position': len(way_nodes)})
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
hide_code
# ================================================== #
# Helper Functions #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield top-level elements of the requested tags, clearing the root
    after each yield to bound memory.

    NOTE: re-defines the get_element declared earlier in this file with
    identical behavior (the notebook repeated the helper in a later cell).
    """
    stream = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(stream)
    for evt, elem in stream:
        if evt != 'end' or elem.tag not in tags:
            continue
        yield elem
        root.clear()
def validate_element(element, validator, schema=SCHEMA):
    """Raise cerberus.ValidationError if *element* does not match *schema*.

    Fix: dict.iteritems() is Python-2-only; iter(...items()) behaves the
    same on Python 2 (list) and Python 3 (view).
    """
    if validator.validate(element, schema) is not True:
        # report only the first failing field, with all its error messages
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.items()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )
class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to UTF-8-encode unicode values on Python 2.

    Fix: the original referenced the Python-2-only names `unicode` and
    `dict.iteritems`, which raise NameError/AttributeError on Python 3.
    The encode step is only needed on Python 2 (Python 3's csv handles
    text natively), so it is now applied only there; the conditional
    short-circuits before `unicode` is ever evaluated on Python 3.
    """
    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8')
                if sys.version_info[0] == 2 and isinstance(v, unicode)
                else v)
            for k, v in row.items()
        })

    def writerows(self, rows):
        # delegate each row through writerow so the encoding applies uniformly
        for row in rows:
            self.writerow(row)
hide_code
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    file_in  -- path of the .osm file to convert
    validate -- when True, validate every shaped element against SCHEMA
                with cerberus before writing (roughly 10x slower)
    """
    # Open all five output csvs at once so each element is written exactly once.
    # NOTE(review): written for Python 2 — on Python 3, csv writers normally
    # need newline='' and an explicit encoding here; confirm before porting.
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
        # One header row per table.
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()
        validator = cerberus.Validator()
        # Stream element by element so memory stays bounded on the 395 MB file.
        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])
I have chosen the map sector of the dynamically developing area in the UAE.
For displaying the area I have used the package "folium" and the coordinates of this area in dubai_abu-dhabi.osm.
hide_code
# Display the coordinates of bounds from .osm file
HTML('<h4>bounds minlat="23.7350" minlon="53.5800" maxlat="26.5390" maxlon="56.8870"</h4>')
hide_code
# Setup the coordinates of the map center and the zoom option.
map_osm = folium.Map(location=[25.2048, 55.2708], zoom_start=8)
# Add labels with coordinates.
folium.LatLngPopup().add_to(map_osm)
# Setup the coordinates of the map area (bounding-box corners, closed ring).
points=[[23.7350, 53.5800], [23.7350, 56.8870], [26.5390, 56.8870], [26.5390, 53.5800], [23.7350, 53.5800]]
# Setup the border line with options.
folium.PolyLine(points, color="red", weight=5, opacity=0.3).add_to(map_osm)
# Display the map (the last expression of a notebook cell is rendered).
map_osm
There are several ways to extract geodata. One of them is to use the following Python code cell. This set of commands allows us to download a file in the .osm format using the coordinates of the bounding rectangle's corners.
# Extract from overpass-api.de
# Fix: the original URL contained a stray space after 'map?', which breaks
# the query string. NOTE(review): urllib.URLopener is Python-2-only (removed
# in Python 3; use urllib.request.urlretrieve there).
file00 = urllib.URLopener()
file00.retrieve("http://overpass-api.de/api/map?bbox=53.5800,23.7350,56.8870,26.5390", "dubai_abu-dhabi0.osm")
Another possible way is extracting data files in many different formats from the website:
https://mapzen.com/data/metro-extracts/metro/dubai_abu-dhabi/ .
The files dubai_abu-dhabi.osm, dubai_abu-dhabi_buildings.geojson, etc. were downloaded.
hide_code
# Setup file directories and names of file variables
# NOTE(review): absolute, user-specific paths — adjust to your environment.
filedir1 = '/Users/olgabelitskaya/large-repo/'
filedir2 = '/Users/olgabelitskaya/large-repo/dubai_abu-dhabi.imposm-geojson/'
filedir3 = '/Users/olgabelitskaya/large-repo/dubai_abu-dhabi.imposm-shapefiles/'
file0 = filedir1 + 'dubai_abu-dhabi0.osm'
file1 = filedir1 + 'dubai_abu-dhabi.osm'
file2 = filedir2 + 'dubai_abu-dhabi_admin.geojson'
file3 = filedir2 + 'dubai_abu-dhabi_roads.geojson'
file4 = filedir2 + 'dubai_abu-dhabi_waterareas.geojson'
file5 = filedir2 + 'dubai_abu-dhabi_buildings.geojson'
file6 = filedir2 + 'dubai_abu-dhabi_amenities.geojson'
hide_code
# Get size of the .osm files (Python 2 print statements)
print "Size of files"
print "dubai_abu-dhabi0.osm: ", os.path.getsize(file0)
print "dubai_abu-dhabi.osm: ", os.path.getsize(file1)
# Get size of the .geojson files
print "dubai_abu-dhabi_admin.geojson: ", os.path.getsize(file2)
print "dubai_abu-dhabi_roads.geojson: ", os.path.getsize(file3)
print "dubai_abu-dhabi_waterareas.geojson: ", os.path.getsize(file4)
print "dubai_abu-dhabi_buildings.geojson: ", os.path.getsize(file5)
print "dubai_abu-dhabi_amenities.geojson: ", os.path.getsize(file6)
This is not such a large piece of data to process (394.4 MB in the dubai_abu-dhabi.osm file), and for me it is a very interesting subject for research for many reasons.
For example, it is a constant and rapidly changing territory with awesome ideas about area development.
Applying the special function (§ 0.3) I created the sample_dubai_abu-dhabi.osm file from the dubai_abu-dhabi .osm file.
hide_code
# Setup the file for sample extraction
OSM_FILE = file1
# Setup the name for the file with a sample
SAMPLE_FILE = "sample_dubai_abu-dhabi.osm"
hide_code
# Create a sample file
k = 100 # Parameter: take every k-th top level element
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n ')
    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))
    output.write('</osm>')
hide_code
# Setup the file directory and the name of a file variable
file8 = filedir1 + 'sample_dubai_abu-dhabi.osm'
hide_code
# Get size of the created .osm file (Python 2 print statement)
print "Size of sample_dubai_abu-dhabi.osm: ", os.path.getsize(file8)
It's possible to download from OpenStreetMap several type of files: .osm, .geojson, etc.
For displaying the data in .geojson files the package "geopandas" also can be useful. As an example you can see the map of administrative borders, roads and water areas.
hide_code
# Read the .geojson files into GeoDataFrames
df_admin = gpd.read_file(file2)
hide_code
df_roads = gpd.read_file(file3)
hide_code
df_waterareas = gpd.read_file(file4)
hide_code
df_buildings = gpd.read_file(file5)
hide_code
df_amenities = gpd.read_file(file6)
hide_code
# Report (rows, columns) of each frame (Python 2 print statements)
print "The dimensionality of the data"
print "dataframe for admin borders:", df_admin.shape
print "dataframe for roads:", df_roads.shape
print "dataframe for water areas:", df_waterareas.shape
print "dataframe for buildings:", df_buildings.shape
print "dataframe for amenities:", df_amenities.shape
hide_code
print "Displaying the examples of these data frames"
df_admin.head(3)
hide_code
# Setup the size of the image
matplotlib.rcParams['figure.figsize'] = (14, 14)
# plt.figure(figsize=(14,14))
# Print map
df_admin.plot()
plt.show()
hide_code
df_roads.head(3)
hide_code
matplotlib.rcParams['figure.figsize'] = (14, 14)
df_roads.plot()
plt.show()
hide_code
df_waterareas.head(3)
hide_code
matplotlib.rcParams['figure.figsize'] = (14, 14)
df_waterareas.plot()
plt.show()
hide_code
df_buildings.head(3)
hide_code
matplotlib.rcParams['figure.figsize'] = (14, 14)
df_buildings.plot()
plt.show()
hide_code
df_amenities.head(3)
hide_code
matplotlib.rcParams['figure.figsize'] = (14, 14)
df_amenities.plot()
plt.show()
For displaying the data in shapefiles it's possible to apply the package "basemap". As an example you can see the map of roads and aeroways.
hide_code
# Setup the size of the image
matplotlib.rcParams['figure.figsize'] = (14, 14)
# Setup the colors for surfaces
water = 'lightskyblue'
earth = 'cornsilk'
# Create a map
# NOTE(review): the Basemap import is commented out at the top of this file,
# so this cell raises NameError unless that import is re-enabled — confirm.
mm = Basemap(llcrnrlon=53.58, llcrnrlat=23.73, urcrnrlon=56.89, urcrnrlat=26.53,
             ellps='WGS84', resolution='i', projection='cass', lat_0 = 25.0756, lon_0 = 55.3821)
# Variables for drawing map components
coast = mm.drawcoastlines()
rivers = mm.drawrivers(color=water, linewidth=1)
continents = mm.fillcontinents(color=earth,lake_color=water)
bound= mm.drawmapboundary(fill_color=water)
countries = mm.drawcountries()
merid = mm.drawmeridians(np.arange(-180, 180, 0.5), labels=[False, False, False, True])
parall = mm.drawparallels(np.arange(0, 80, 0.5), labels=[True, True, False, False])
# Read shapefiles
mm.readshapefile('/Users/olgabelitskaya/large-repo/dubai_abu-dhabi.imposm-shapefiles/dubai_abu-dhabi_osm_roads',
                 name='roads', drawbounds=True, color='grey')
mm.readshapefile('/Users/olgabelitskaya/large-repo/dubai_abu-dhabi.imposm-shapefiles/dubai_abu-dhabi_osm_aeroways',
                 name='aeroways', drawbounds=True, color='blue')
# Display the map
plt.show()
Applying the special function (§ 0.3) I created the dubai_abu-dhabi.osm.json from the dubai_abu-dhabi.osm file.
hide_code
# Extract data from the dataset in the .osm format as json files
# data1 = process_map(file1)
hide_code
# Setup the variable for the .json file
file7 = filedir1 + 'dubai_abu-dhabi.osm.json'
hide_code
# Get size of the .json file (Python 2 print statement)
print "size of dubai_abu-dhabi.osm.json: ", os.path.getsize(file7)
hide_code
# Creating csv files
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(file1, validate=False)
hide_code
print "size of nodes.csv: ", os.path.getsize('nodes.csv')
print "size of nodes_tags.csv: ", os.path.getsize('nodes_tags.csv')
print "size of ways.csv: ", os.path.getsize('ways.csv')
print "size of ways_tags.csv: ", os.path.getsize('ways_tags.csv')
print "size of ways_nodes.csv: ", os.path.getsize('ways_nodes.csv')
hide_code
# Export each GeoDataFrame as a tab-separated csv
df_admin.to_csv('admin.csv', sep='\t', encoding='utf-8')
hide_code
df_roads.to_csv('roads.csv', sep='\t', encoding='utf-8')
hide_code
df_waterareas.to_csv('waterareas.csv', sep='\t', encoding='utf-8')
hide_code
df_buildings.to_csv('buildings.csv', sep='\t', encoding='utf-8')
hide_code
df_amenities.to_csv('amenities.csv', sep='\t', encoding='utf-8')
hide_code
print "size of admin.csv: ", os.path.getsize('admin.csv')
print "size of roads.csv: ", os.path.getsize('roads.csv')
print "size of waterareas.csv: ", os.path.getsize('waterareas.csv')
print "size of buildings.csv: ", os.path.getsize('buildings.csv')
print "size of amenities.csv: ", os.path.getsize('amenities.csv')
Let's explore the data in the .osm file in detail. It contains a lot of information about geographical objects.
OpenStreetMap represents physical features on the ground (e.g., roads or buildings) using tags attached to its basic data structures (its nodes, ways, and relations). Tags help describe an item and allow it to be found again by browsing or searching. They are chosen by the item's creator depending on the data point.
hide_code
# Count tags across the whole file (Python 2 print statements below)
print count_tags(file1)
Map data is collected from zero by volunteers (users). We can get the number and the list of them for this piece of the data.
hide_code
# Count users of the map editing
users1 = process_map_users(file1)
hide_code
# Display number of users
print "Number of users -", len(users1)
# Display example of the user list
user_list = list(users1)
print sorted(user_list)[:50]
Exploring the digital data in this file, we can get a large number of other statistics and information.
hide_code
# Count keys by types
process_map_keys(file1)
hide_code
# Count street addresses
street_number(file1)
hide_code
# Count places by types
print process_map_places(file1)
hide_code
# Count names in english with values
english_names1 = process_map_names(file1)
hide_code
print "The number of names in English: ", english_names1[0]
hide_code
print list(english_names1[1])[:50]
On this map it may be noted a large number of duplicate names in English.
In UAE mail is usually delivered to a P.O Box. As we can see practically all postcodes are individual. Let's display the list of P.O Boxes and the number of users for each of them (1-2 in the most of cases).
hide_code
print "The number of postcodes:", zip_codes(file1)[0]
hide_code
# Audit P.O Box values (tally per postcode)
postcode1 = process_map_postcodes(file1)
hide_code
# Display P.O Box tallies
print postcode1
hide_code
# Audit street names
street_names1 = audit_street_names(file1)
hide_code
# Display street names grouped by unexpected street type
pprint.pprint(dict(street_names1))
hide_code
# Update street names.
# Fix: dict.iteritems() is Python-2-only; .items() is equivalent here and
# works on Python 3 too. The print is converted to a form that produces
# identical output on both interpreters.
for street_type, ways in street_names1.items():
    for name in ways:
        better_name = update_name(name, mapping, street_type_re)
        print("{0} => {1}".format(name, better_name))
More accurate correction is possible by comparison with data from other map sites and in the studying of the real situation.
From csv files we can create separated data frames. Let's have a look on them.
hide_code
# Setup variables for data frames created from the csv files.
# Fix: these assignments must run before `nodes` is used — the original
# cell order referenced `nodes` (len/histogram) before defining it, which
# raises NameError when executed top to bottom.
nodes = odo('nodes.csv', pd.DataFrame)
ways = odo('ways.csv', pd.DataFrame)
nodes_tags = odo('nodes_tags.csv', pd.DataFrame)
ways_tags = odo('ways_tags.csv', pd.DataFrame)
ways_nodes = odo('ways_nodes.csv', pd.DataFrame)
print("Nodes: {0}".format(len(nodes)))
matplotlib.rcParams['figure.figsize'] = (6, 4)
plt.style.use('seaborn-pastel')
nodes.timestamp.hist()
print("Histogram for node timestamps")
hide_code
# Peek at the loaded tables (Python 2 print statements below)
nodes.head(1)
hide_code
nodes_tags.head(1)
print "List of keys for node tags"
print sorted(set(nodes_tags['key']))
hide_code
print "List of values for node tags 'wifi'"
print set(nodes_tags[nodes_tags['key'] == 'wifi']['value'])
hide_code
print "List of values for node tags 'place'"
print sorted(set(nodes_tags[nodes_tags['key'] == 'place']['value']))
hide_code
print "List of values for node and way tags 'street'"
streets_nodes = set(nodes_tags[nodes_tags['key'] == 'street']['value'])
streets_ways = set(ways_tags[ways_tags['key'] == 'street']['value'])
print "Number of streets in table 'nodes': ", len(streets_nodes), ';', \
"number of streets in table 'ways': ", len(streets_ways)
print
print sorted(streets_nodes.union(streets_ways))
hide_code
print "Ways: ", len(ways)
ways.head(3)
hide_code
print "Example of way tags"
ways_tags.head(3)
hide_code
print "Example of ways-nodes connections"
ways_nodes.head(3)
We can manage the data with special tools as well.
The set of comands in the package "odo" allowed to download information from the files to the SQL database.
hide_code
# Import the tables into the sqlite database via odo; discover() infers the
# datashape (column names/types) from each csv before loading.
dshape_nodes = discover(resource('nodes.csv'))
nodes_sql = odo('nodes.csv', 'sqlite:///openstreetmap_dubai.db::nodes', dshape=dshape_nodes)
hide_code
dshape_ways = discover(resource('ways.csv'))
ways_sql = odo('ways.csv', 'sqlite:///openstreetmap_dubai.db::ways', dshape=dshape_ways)
hide_code
dshape_nodes_tags = discover(resource('nodes_tags.csv'))
nodes_tags_sql = odo('nodes_tags.csv', 'sqlite:///openstreetmap_dubai.db::nodes_tags', dshape=dshape_nodes_tags)
hide_code
dshape_ways_tags = discover(resource('ways_tags.csv'))
ways_tags_sql = odo('ways_tags.csv', 'sqlite:///openstreetmap_dubai.db::ways_tags', dshape=dshape_ways_tags)
hide_code
dshape_ways_nodes = discover(resource('ways_nodes.csv'))
ways_nodes_sql = odo('ways_nodes.csv', 'sqlite:///openstreetmap_dubai.db::ways_nodes', dshape=dshape_ways_nodes)
hide_code
# Open a connection and cursor for the ad-hoc queries below
conn = sqlite3.connect('openstreetmap_dubai.db')
c = conn.cursor()
The number of nodes:
# Total row count of the nodes table
c.execute("SELECT COUNT(*) FROM nodes;")
print c.fetchall()
The number of ways:
# Total row count of the ways table
c.execute("SELECT COUNT(*) FROM ways;")
print c.fetchall()
hide_code
conn.close()
hide_code
# Re-open the database for the user-activity queries
conn = sqlite3.connect('openstreetmap_dubai.db')
c = conn.cursor()
With the help of simple manipulations in the database, the user can perform a selection of interesting information.
# Distinct contributors across both nodes and ways
c.execute("SELECT COUNT(DISTINCT(e.uid)) FROM \
(SELECT uid FROM nodes UNION ALL SELECT uid FROM ways) e;")
print c.fetchall()
The database allows to evaluate the contribution of each individual user in map editing.
Let us list the 3 most active editors of this map section:
c.execute("SELECT e.user, COUNT(*) as num \
FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e \
GROUP BY e.user \
ORDER BY num DESC \
LIMIT 3;")
print c.fetchall()
The examples of the 10 users with only one note:
c.execute("SELECT e.user, COUNT(*) as num \
FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e \
GROUP BY e.user \
ORDER BY num \
LIMIT 10;")
print c.fetchall()
hide_code
conn.close()
hide_code
# Re-open the database for the tag-value queries
conn = sqlite3.connect('openstreetmap_dubai.db')
c = conn.cursor()
The list of the 3 most common types of places:
c.execute("SELECT value, COUNT(*) as num \
FROM nodes_tags \
WHERE key='place' \
GROUP BY value \
ORDER BY num DESC \
LIMIT 3;")
print c.fetchall()
The list of building types:
c.execute("SELECT value, COUNT(*) as num \
FROM nodes_tags \
WHERE key='building' \
GROUP BY value \
ORDER BY num DESC;")
building_types = c.fetchall()
print building_types
# Plot the building-type counts as a scatter chart.
# Fixes: a leftover debug print of the available style names is removed;
# the row conversion uses list() instead of a nested comprehension; the
# style is applied before drawing so it affects this figure.
building_df = pd.DataFrame([list(row) for row in building_types])
matplotlib.rcParams['figure.figsize'] = (14, 5)
plt.style.use('seaborn-pastel')
x = building_df.index.values
y = building_df[1]
# label each x position with its building-type name
plt.xticks(x, building_df[0], rotation=60)
plt.scatter(x, y, marker='H', s=100, alpha=0.5)
The list of the 10 most common facilities:
# Top amenity values among node tags
c.execute("SELECT value, COUNT(*) as num \
FROM nodes_tags \
WHERE key='amenity' \
GROUP BY value \
ORDER BY num DESC \
LIMIT 10;")
print c.fetchall()
The list of the 3 most common zipcodes:
c.execute("SELECT value, COUNT(*) as num \
FROM nodes_tags \
WHERE key='postcode' \
GROUP BY value \
ORDER BY num DESC \
LIMIT 3;")
print c.fetchall()
Counting zipcodes with one document:
# Ascending order surfaces the rarest postcodes first
c.execute("SELECT value, COUNT(*) as num \
FROM nodes_tags \
WHERE key='postcode' \
GROUP BY value \
ORDER BY num \
LIMIT 70;")
print c.fetchall()
The list of the 20 most common streets:
c.execute("SELECT value, COUNT(*) as num \
FROM nodes_tags \
WHERE key='street' \
GROUP BY value \
ORDER BY num DESC \
LIMIT 20;")
print c.fetchall()
hide_code
conn.close()
One of the main problems of public maps is the lack of place-name duplication in other languages. If it were possible to automate the translation process by expanding a common database of map names in many languages, it would save users from many difficulties and mistakes.
The next problem - the presence of a large number of databases (including mapping) on the same map objects. Some intergraph procedures of already available data would relieve a lot of people from unnecessary work, save time and effort.
Obviously, the information about the number of buildings and their purpose is incomplete. Completeness of public maps can be increased by bringing in the process of mapping new users. For this goal enter the information should be as simple as possible: for example, a choice of the available options with automatic filling many fields for linked options (for example, linking the name of the street and the administrative area in which it is located).
There are a number of mistakes and typos as in every public data. For correction them well-known methods can be proposed: automatic comparison with existing data and verification for new data by other users.
The lack of a uniform postal code system in this concrete dataset complicates their identification and verification.
During working on the project, I spent a lot of time on the conversion of one type of data file to another. Each format has its own advantages and disadvantages. Probably, it is possible to design a universal file type that allows us to store data of any kind, combining the advantages of all existing types and applicable in the most of existing programming languages.
Correction of errors made in the data seems to me appropriate to carry out after uploading files to the database. Sometimes a record that is a mistake in terms of filling a particular type of data just contains additional information about geoobjects.
1) nodes - points in space with basic characteristics (lat, long, id, tags);
2) ways - defining linear features and area boundaries (an ordered list of nodes);
3) relations - tags and also an ordered list of nodes, ways and/or relations as members which is used to define logical or geographic relationships between other elements.
1) Size of the .osm file: 394,4 MB.
2) Size of the .osm sample file : 3,9 MB.
3) Nodes: 1890178.
4) Ways: 234327.
5) Relations: 2820.
6) Tags: 503027.
7) Users: 1895.
With the help of a specific set of commands we can perform a statistical description of the data collections and the database.
I think this project is educational for me. I believe that one of the main tasks in this case was to study the methods of extraction and researching of map data in open access. For example, I used a systematic sample of elements from the original .osm file for trying functions of processing before applying them to the whole dataset. As a result I have some new useful skills in parsing, processing, storing, aggregating and applying the data.
In the research I have read through quite a lot of projects of other students on this topic. After my own research and review the results of other authors I have formed a definite opinion about the ideas in OpenStreetMap.
This website can be viewed as a testing ground for the interaction of a large number of people (including non-professionals) in creating a unified information space. The prospects of such cooperation cannot be overstated. The success of the project will allow the implementation of ambitious plans in the field of accessible information technologies, the creation of virtual reality and many other areas.
Increasing the number of users leads to many positive effects in this kind of project:
1) a rapid improvement in the accuracy, completeness and timeliness of information;
2) a closer match between the information space and reality, and more objective data evaluation;
3) a reduction of the effort needed to clean erroneous data.
Ideas for improving the project OpenStreetMap are simple and natural.
Increasing the number of users can be achieved by additional options like marks of the rating evaluation (eg, the best restaurant or the most convenient parking).
The popularity of the project may be more due to the temporary pop-up messages of users (placement is not more than 1-3 hours) with actual information about the geographic location (eg, the presence of traffic jams).