from ipykernel import kernelapp as app
from IPython.core.display import display, HTML
import matplotlib
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic — only valid inside a notebook cell, commented out for plain-Python use
import folium
import geopandas as gpd
from mpl_toolkits.basemap import Basemap
import xml.etree.cElementTree as ET
from collections import defaultdict as dfdict
import numpy as np
import pprint
import urllib
import re
import os
import csv
import cerberus
import json
import codecs
import signal
import subprocess
# Launch a local MongoDB server as a child process; os.setsid detaches it
# into its own session so signals sent to this script do not kill it.
pro = subprocess.Popen('mongod', preexec_fn = os.setsid)
from pymongo import MongoClient
# Database name used throughout the notebook.
db_name = 'openstreetmap'
# Connect to Mongo DB
client = MongoClient('localhost:27017')
db = client[db_name]
from IPython.display import Image
import collections
import sqlite3
import geopandas.io
from sqlalchemy import create_engine
import pandas as pd
import io
# Function for counting tags
def count_tags(filename):
    """Return a mapping of XML tag name -> number of occurrences in *filename*."""
    tag_counts = dfdict(int)
    # iterparse with default 'end' events visits every element exactly once.
    for _, elem in ET.iterparse(filename):
        tag_counts[elem.tag] += 1
    return tag_counts
# Functions for counting users
def get_user(element):
    """Return the 'user' attribute of *element*, or None if absent.

    The original was a stub that always returned None; returning the
    actual contributor name makes the helper usable by the user-auditing
    code in this file.
    """
    return element.get('user')
def process_map_users(filename):
    """Return the set of distinct 'user' attribute values on node, way
    and relation elements in the OSM file *filename*.

    Uses Element.get so elements without a 'user' attribute (anonymous
    edits) are skipped instead of raising KeyError as the original did.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag in ('node', 'way', 'relation'):
            user = element.get('user')
            if user is not None:
                users.add(user)
    return users
# Strings containing lower case chars
# (keys made up solely of a-z and '_', e.g. 'highway')
lower = re.compile(r'^([a-z]|_)*$')
# Strings with lower case chars and a ':'
# (e.g. 'addr:street' — one colon separating two lowercase parts)
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Strings with chars that will cause problems as keys.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Function for sorting by key type
def key_type(element, keys):
    """Classify a <tag> element's 'k' attribute into one of four buckets.

    Increments exactly one of keys['lower'], keys['lower_colon'],
    keys['problemchars'] or keys['other'] (module-level regexes) and
    returns the mutated *keys* dict. Non-<tag> elements are ignored.
    Uses `is not None` instead of the original `!= None` comparison and
    hoists the attribute lookup out of the chain.
    """
    if element.tag == "tag":
        k = element.attrib['k']
        if lower.search(k) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k) is not None:
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys
# Function for counting keys by type
def process_map_keys(filename):
    """Tally every tag key in the file by character-class category."""
    tally = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        tally = key_type(element, tally)
    return tally
# Function for counting address attributes by type
def address_attribute(filename):
    """Count occurrences of each 'addr:*' key on node/way <tag> children.

    Returns a dict mapping the full key (e.g. 'addr:street') to its count.
    The 'addr:' pattern is compiled once (the original recompiled it for
    every tag), and tags without a 'k' attribute are skipped instead of
    crashing re.search with None.
    """
    addr_pattern = re.compile(r"addr:.*$")
    address_attributes = {}
    for _, elem in ET.iterparse(filename):
        if elem.tag in ("node", "way"):
            for tag in elem.iter("tag"):
                key = tag.get("k")
                if key is not None and addr_pattern.search(key):
                    address_attributes[key] = address_attributes.get(key, 0) + 1
    return address_attributes
# Function for counting street addresses
def street_number(file_name):
    """Count <tag k="addr:street"> entries on node and way elements.

    Uses the default 'end' events so each element's children are fully
    parsed before they are inspected; the original used 'start' events,
    which only see children by accident of the parser's read-ahead
    buffering and can silently undercount.
    """
    count = 0
    for _, elem in ET.iterparse(file_name):
        if elem.tag in ('node', 'way'):
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "addr:street":
                    count += 1
    return count
# Function for counting zip codes.
def zip_codes(filename):
    """Collect postal codes from node/way elements.

    Returns (count, data): the total number of addr:postcode tags and the
    set of distinct postcode values. Uses default 'end' events so children
    are guaranteed to be parsed (the original relied on 'start' events,
    which only work via parser read-ahead buffering).
    """
    count = 0
    data = set()
    for _, elem in ET.iterparse(filename):
        if elem.tag in ('node', 'way'):
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "addr:postcode":
                    count += 1
                    data.add(tag.attrib['v'])
    return count, data
# Functions for auditing zip codes
# Valid Moscow-region postcode ranges, as strings (OSM attribute values are
# strings). Explicit list() and a list comprehension keep this working on
# both Python 2 and Python 3: the original `range + range` raises TypeError
# on Python 3, and the lazy `map` iterator there would be exhausted after a
# single membership scan.
expected = [str(code) for code in
            list(range(140000, 143000)) +
            list(range(144000, 145000)) +
            list(range(600000, 603000))]
def audit_postcode_range(postcode, wrong_postcode, tag):
    """Tally *tag*'s postcode value into one of two count dicts.

    Values found in the module-level `expected` list go into *postcode*,
    everything else into *wrong_postcode*. Both dicts map value -> count.
    """
    value = tag.attrib["v"]
    target = postcode if value in expected else wrong_postcode
    target[value] = target.get(value, 0) + 1
def is_postcode(elem):
    """True when this <tag> element carries a postal-code key."""
    return elem.attrib['k'] == "addr:postcode"
def process_map_postcodes(filename):
    """Audit every postcode tag in the OSM file.

    Returns (postcode, wrong_postcode): value -> count dicts for expected
    and unexpected postcodes respectively. The file is opened with a
    `with` block so the handle is always closed (the original leaked it),
    and default 'end' events guarantee children are parsed before being
    inspected.
    """
    postcode = {}
    wrong_postcode = {}
    with open(filename, "r") as osm_file:
        for _, elem in ET.iterparse(osm_file):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_postcode(tag):
                        audit_postcode_range(postcode, wrong_postcode, tag)
    return postcode, wrong_postcode
# Functions for correction zip codes
def update_postcode(tag):
    """Blank out an invalid postcode by setting the tag's value to None.

    Returns the same tag for convenience.
    NOTE(review): ElementTree cannot serialize a None attribute value —
    presumably the mutated tree is only used in memory; confirm intent.
    """
    tag.attrib["v"] = None
    return tag
def correct_postcode(filename):
    """Set the value of every postcode tag equal to 'RU' to None.

    The file is opened in a `with` block so the handle is closed even on
    error (the original leaked it).
    NOTE(review): the mutation happens on the in-memory parse only —
    nothing is written back to *filename*, so this has no persistent
    effect on the data; confirm whether a write-back was intended.
    """
    with open(filename, "r") as osm_file:
        for _, elem in ET.iterparse(osm_file):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_postcode(tag) and tag.attrib["v"] == 'RU':
                        update_postcode(tag)
# Function for displaying english names
def process_map_names(filename):
    """Collect English names ('name:en' tags) from node/way elements.

    Returns (count, data): the number of name:en tags and the set of
    distinct English name values. Uses default 'end' events so children
    are fully parsed before inspection (the original's 'start' events
    only worked via parser read-ahead buffering).
    """
    count = 0
    data = set()
    for _, elem in ET.iterparse(filename):
        if elem.tag in ('node', 'way'):
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "name:en":
                    count += 1
                    data.add(tag.attrib['v'])
    return count, data
# Function for sorting by place
def place_type(element, places):
    """Tally a node's 'place' tag value into the *places* count dict.

    Known categories ('city', 'town', 'village', 'hamlet') increment their
    own bucket; any other value increments 'other'. Non-node elements are
    ignored. Returns the mutated *places* dict.
    """
    known = ('city', 'town', 'village', 'hamlet')
    if element.tag == "node":
        for tag in element.iter('tag'):
            if tag.attrib['k'] == 'place':
                value = tag.attrib['v']
                places[value if value in known else 'other'] += 1
    return places
# Function for counting places by type
def process_map_places(filename):
    """Tally place types across the whole OSM file via place_type()."""
    tallies = {"city": 0, "town": 0, "village": 0, "hamlet": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        tallies = place_type(element, tallies)
    return tallies
# Functions for creating the sample file
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element from *osm_file* whose tag is in *tags*.

    The root element is cleared after every yield so already-processed
    elements can be garbage-collected, keeping memory bounded on large
    files.
    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    event_stream = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(event_stream)
    for event, elem in event_stream:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
# Schema for validating elements
# Cerberus validation schema for the shaped rows produced by shape_element:
# the five top-level keys mirror the five csv output tables.
schema1 = {
    'node': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'lat': {'required': True, 'type': 'float', 'coerce': float},
            'lon': {'required': True, 'type': 'float', 'coerce': float},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    'node_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                'type': {'required': True, 'type': 'string'}
            }
        }
    },
    'way': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    'way_nodes': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'node_id': {'required': True, 'type': 'integer', 'coerce': int},
                'position': {'required': True, 'type': 'integer', 'coerce': int}
            }
        }
    },
    'way_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                # fixed: the original literal repeated 'required': True
                # twice in this dict (a harmless but misleading duplicate)
                'type': {'required': True, 'type': 'string'}
            }
        }
    }
}
# Keys like 'addr:street': lowercase/underscore parts around a colon.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Characters that are unsafe to use in csv/sql key names.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['changeset', 'id', 'timestamp', 'uid', 'user', 'version']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape a node or way XML element into dicts of csv rows.

    Returns {'node': ..., 'node_tags': [...]} for nodes,
    {'way': ..., 'way_nodes': [...], 'way_tags': [...]} for ways, and
    None for any other element.

    Fixes vs. the original:
    * the 'position' of way nodes now increments (it was re-initialised
      to 0 inside the loop, so every row got position 0);
    * secondary keys are split at the FIRST colon, so for 'a:b:c' the
      type is 'a' and the key 'b:c' (the original's two regexes produced
      overlapping type 'a:b' / key 'b:c');
    * the node_attr_fields / way_attr_fields / problem_chars /
      default_tag_type parameters are actually honored (defaults preserve
      the original behavior).
    """
    if element.tag == 'node':
        node_attributes = {field: element.attrib[field] for field in node_attr_fields}
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'node': node_attributes, 'node_tags': tags}
    elif element.tag == 'way':
        way_attributes = {field: element.attrib[field] for field in way_attr_fields}
        tags = _shape_tags(element, problem_chars, default_tag_type)
        way_nodes = [
            {'id': element.attrib['id'],
             'node_id': nd.attrib['ref'],
             'position': position}  # running index, not always 0
            for position, nd in enumerate(element.iter('nd'))
        ]
        return {'way': way_attributes, 'way_nodes': way_nodes, 'way_tags': tags}

def _shape_tags(element, problem_chars, default_tag_type):
    """Shape the <tag> children of *element* into csv tag rows,
    dropping keys that contain problematic characters."""
    rows = []
    parent_id = element.attrib['id']
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if problem_chars.search(key):
            continue  # unsafe characters for csv/sql — skip the tag
        if LOWER_COLON.search(key):
            tag_type, tag_key = key.split(':', 1)
        else:
            tag_type, tag_key = default_tag_type, key
        rows.append({'id': parent_id, 'key': tag_key,
                     'value': tag.attrib['v'], 'type': tag_type})
    return rows
# ================================================== #
# Helper Functions #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element whose tag is in *tags*, clearing the root as we
    go so processed elements can be garbage-collected."""
    event_stream = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(event_stream)
    for event, elem in event_stream:
        if event != 'end' or elem.tag not in tags:
            continue
        yield elem
        root.clear()
def validate_element(element, validator, schema=schema1):
    """Raise cerberus.ValidationError if *element* does not match *schema*.

    The error message lists the first failing field and its messages.
    Uses dict.items() — available on both Python 2 and Python 3 — instead
    of the Python-2-only iteritems() the original relied on.
    """
    if validator.validate(element, schema) is not True:
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.items()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )
class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input.

    NOTE(review): Python-2-only — relies on the `unicode` builtin and on
    dict.iteritems(), neither of which exists on Python 3 (where the csv
    module writes text natively and this override would be unnecessary).
    """
    def writerow(self, row):
        # Encode unicode values to UTF-8 bytes because Python 2's csv
        # module can only write byte strings.
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })
    def writerows(self, rows):
        # Delegate to writerow so every row gets the same encoding pass.
        for row in rows:
            self.writerow(row)
# ================================================== #
# Main Function #
# ================================================== #
def process_map_csv(file_in, validate):
    """Iteratively process each XML element of *file_in* and write the
    shaped rows out to the five csv files.

    Reads the module-level *_PATH and *_FIELDS globals, so those must be
    assigned before this is called. When *validate* is True each shaped
    element is checked against the cerberus schema (about 10x slower).
    """
    with codecs.open(NODES_PATH, 'w') as nodes_out, \
         codecs.open(NODE_TAGS_PATH, 'w') as node_tags_out, \
         codecs.open(WAYS_PATH, 'w') as ways_out, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_out, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_out:
        # One writer per output table, keyed by the shaped-dict key.
        writers = {
            'node': UnicodeDictWriter(nodes_out, NODE_FIELDS),
            'node_tags': UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS),
            'way': UnicodeDictWriter(ways_out, WAY_FIELDS),
            'way_nodes': UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS),
            'way_tags': UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS),
        }
        for table in ('node', 'node_tags', 'way', 'way_nodes', 'way_tags'):
            writers[table].writeheader()
        validator = cerberus.Validator()
        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue
            if validate is True:
                validate_element(shaped, validator)
            if element.tag == 'node':
                writers['node'].writerow(shaped['node'])
                writers['node_tags'].writerows(shaped['node_tags'])
            elif element.tag == 'way':
                writers['way'].writerow(shaped['way'])
                writers['way_nodes'].writerows(shaped['way_nodes'])
                writers['way_tags'].writerows(shaped['way_tags'])
# if __name__ == '__main__':
# Note: Validation is ~ 10X slower. For the project consider using a small
# sample of the map when validating.
# NOTE(review): the three patterns below re-define identically-named
# patterns created earlier in the file — notebook-cell residue.
# Strings containing lower case chars
lower = re.compile(r'^([a-z]|_)*$')
# Strings with lower case chars and a ':'
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Strings with chars that will cause problems as keys.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Attribute names grouped under the 'created' sub-document in MongoDB.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
def shape_element(element):
    """Shape a node or way element into a dict for the JSON/MongoDB import.

    Returns None for any other element type. 'addr:*' sub-keys are
    gathered under node['address'] (keys with a second colon after
    'addr:' are skipped), keys with problem characters are reported and
    dropped, and way node references are collected in node['node_refs'].
    NOTE(review): if 'lat' is present but 'lon' is not, the original (and
    this version) raise KeyError — assumed impossible in valid OSM data.
    """
    node = {}
    if element.tag == "node" or element.tag == "way":
        address = {}
        nd = []
        node["type"] = element.tag
        node["id"] = element.attrib["id"]
        if "visible" in element.attrib:  # idiom: no .keys() call needed
            node["visible"] = element.attrib["visible"]
        if "lat" in element.attrib:
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        node["created"] = {"version": element.attrib['version'],
                           "changeset": element.attrib['changeset'],
                           "timestamp": element.attrib['timestamp'],
                           "uid": element.attrib['uid'],
                           "user": element.attrib['user']}
        for tag in element.iter("tag"):
            p = problemchars.search(tag.attrib['k'])
            if p:
                # print() call form keeps this diagnostic working on both
                # Python 2 and Python 3 (was a Python-2 print statement).
                print("problemchars: " + p.group())
                continue
            elif tag.attrib['k'][:5] == "addr:":
                if ":" in tag.attrib['k'][5:]:
                    continue  # e.g. 'addr:street:name' — ambiguous, skip
                else:
                    address[tag.attrib['k'][5:]] = tag.attrib['v']
            else:
                node[tag.attrib['k']] = tag.attrib['v']
        if address != {}:
            node['address'] = address
        for tag2 in element.iter("nd"):
            nd.append(tag2.attrib['ref'])
        if nd != []:
            node['node_refs'] = nd
        return node
    else:
        return None
# Function for creating the .json file
def process_map(file_in, pretty=False):
    """Shape every element of *file_in* and dump each one as a JSON line.

    Writes '<file_in>.json' alongside the input and also returns the
    shaped documents as a list. With pretty=True each document is
    indented over multiple lines.
    """
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            data.append(shaped)
            indent = 2 if pretty else None  # indent=None == compact default
            fo.write(json.dumps(shaped, indent=indent) + "\n")
    return data
# Display the coordinates of bounds from .osm file
HTML('<h4>bounds minlat="55.5037498" minlon="38.3944702" maxlat="56.1011515" maxlon="39.5617675"</h4>')
# Setup the coordinates of the map center and the zoom option.
map_osm = folium.Map(location=[55.7986, 38.9754], zoom_start=9)
# Add labels with coordinates.
folium.LatLngPopup().add_to(map_osm)
# Setup the coordinates of the map area.
points=[[55.5037, 38.3945], [55.5037, 39.5618], [56.1012, 39.5618], [56.1012, 38.3945], [55.5037, 38.3945]]
# Setup the border line with options.
folium.PolyLine(points, color="red", weight=5, opacity=0.3).add_to(map_osm)
# Display the map.
map_osm
# Extract from overpass-api.de
# NOTE(review): urllib.URLopener is Python-2-only (removed in Python 3,
# use urllib.request.urlretrieve there); the bbox URL also contains a
# space after 'map?' — verify the API accepts it.
file00 = urllib.URLopener()
file00.retrieve("http://overpass-api.de/api/map? bbox=38.3945,55.5037,39.5618,56.1012", "moscow_region0.osm")
# Alternative download source: https://mapzen.com/data/metro-extracts/
# Setup file directories and names of file variables
filedir1 = '/Users/olgabelitskaya/large-repo/'
filedir2 = '/Users/olgabelitskaya/large-repo/moscow_region.imposm-geojson/'
filedir3 = '/Users/olgabelitskaya/large-repo/moscow_region.imposm-shapefiles/'
file0 = filedir1 + 'moscow_region0.osm'
file1 = filedir1 + 'moscow_region.osm'
file2 = filedir2 + 'moscow_region_admin.geojson'
file3 = filedir2 + 'moscow_region_roads.geojson'
file4 = filedir2 + 'moscow_region_waterareas.geojson'
# Get size of the .osm files
os.path.getsize(file0)
os.path.getsize(file1)
# Get size of the .geojson files
os.path.getsize(file2)
os.path.getsize(file3)
os.path.getsize(file4)
# Setup the file for sample extraction
OSM_FILE = file1
# Setup the name for the file with a sample
SAMPLE_FILE = "sample.osm"
# Create a sample file
k = 100 # Parameter: take every k-th top level element
# NOTE(review): the file is opened in binary mode but str literals are
# written — fine on Python 2, but a TypeError on Python 3.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n ')
    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))
    output.write('</osm>')
# Setup the file directory and the name of a file variable
file5 = filedir1 + 'sample.osm'
# Get size of the created .osm file
os.path.getsize(file5)
# Read the .geojson files
df_admin = gpd.read_file(file2)
df_roads = gpd.read_file(file3)
df_waterareas = gpd.read_file(file4)
# Get the dimensionality of the data
df_admin.shape
df_roads.shape
df_waterareas.shape
# Display examples of data rows
df_admin.head(3)
df_roads.head(3)
df_waterareas.head(3)
# Setup the size of the image
# matplotlib.rcParams['figure.figsize'] = (14, 14)
plt.figure(figsize=(14,14))
# Print map
df_admin.plot()
plt.show()
plt.figure(figsize=(14,14))
df_roads.plot()
plt.show()
plt.figure(figsize=(14,14))
df_waterareas.plot()
plt.show()
# Setup the size of the image
plt.figure(figsize=(14,14))
# Setup the colors for surfaces
water = 'lightskyblue'
earth = 'cornsilk'
# Create a map (Cassini projection centered on the region's midpoint).
mm = Basemap(llcrnrlon=38.39, llcrnrlat=55.50, urcrnrlon=39.56, urcrnrlat=56.10,
             ellps='WGS84', resolution='i', projection='cass', lat_0 = 55.7986, lon_0 = 38.9754)
# Variables for drawing map components
coast = mm.drawcoastlines()
rivers = mm.drawrivers(color=water, linewidth=1)
continents = mm.fillcontinents(color=earth,lake_color=water)
bound= mm.drawmapboundary(fill_color=water)
countries = mm.drawcountries()
merid = mm.drawmeridians(np.arange(-180, 180, 0.2), labels=[False, False, False, True])
parall = mm.drawparallels(np.arange(0, 80, 0.2), labels=[True, True, False, False])
# Read shapefiles (admin boundaries, roads, water areas) onto the map.
mm.readshapefile('/Users/olgabelitskaya/large-repo/moscow_region.imposm-shapefiles/moscow_region_osm_admin',
                 name='admin', drawbounds=True, color='red')
mm.readshapefile('/Users/olgabelitskaya/large-repo/moscow_region.imposm-shapefiles/moscow_region_osm_roads',
                 name='roads', drawbounds=True, color='grey')
mm.readshapefile('/Users/olgabelitskaya/large-repo/moscow_region.imposm-shapefiles/moscow_region_osm_waterareas',
                 name='waterareas', drawbounds=True, color='cornflowerblue')
# Display the map
plt.show()
# Setup variables for extracting csv files (sample)
# These module-level globals are read by process_map_csv.
OSM_PATH = file5
NODES_PATH = "nodes0.csv"
NODE_TAGS_PATH = "nodes_tags0.csv"
WAYS_PATH = "ways0.csv"
WAY_NODES_PATH = "ways_nodes0.csv"
WAY_TAGS_PATH = "ways_tags0.csv"
# Extract data from sample.osm as csv files
# process_map_csv(OSM_PATH, validate=True)
# Setup variables for extracting csv files (dataset)
OSM_PATH = file1
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
# Extract data from the dataset in the .osm format as csv files
# process_map_csv(OSM_PATH, validate=False)
# Extract data from the dataset in the .osm format as json files
data1 = process_map(file1)
# Setup the variable for the .json file
file7 = filedir1 + 'moscow_region.osm.json'
# Get size of the .json file
os.path.getsize(file7)
# Count tags
count_tags(file1)
# Count users of the map editing
users1 = process_map_users(file1)
# Display number of users
print "Number of users -", len(users1)
# Display example of the user list
user_list = list(users1)
print sorted(user_list)[:50]
# Count keys by types
process_map_keys(file1)
# Count address attributes by type
address_attribute(file1)
# Count street addresses
street_number(file1)
# Count places by types
process_map_places(file1)
# Count names in english with values
english_names1 = process_map_names(file1)
# Display english names
print english_names1
# Display zip codes
print zip_codes(file1)
# Audit zip codes
postcode1, wrong_postcode1 = process_map_postcodes(file1)
# Display number of unexpected zip codes
wrong_postcode1
# Blank invalid 'RU' postcodes (in-memory only) and re-audit.
correct_postcode(file1)
postcode1, wrong_postcode1 = process_map_postcodes(file1)
wrong_postcode1
# Build mongoimport command
# NOTE(review): find('.') scans the full path, so `collection` becomes the
# whole path prefix '/Users/.../moscow_region' — confirm MongoDB accepts
# this as a collection name.
collection = file1[:file1.find('.')]
mongoimport_cmd = 'mongoimport -h 127.0.0.1:27017 ' + '--db ' + db_name + \
                  ' --collection ' + collection + ' --file ' + file7
# Drop collection (if it's already running)
if collection in db.collection_names():
    print 'Dropping collection: ' + collection
    db[collection].drop()
# Execute the command
print 'Executing: ' + mongoimport_cmd
subprocess.call(mongoimport_cmd.split())
# Create mongo db
moscow_region = db[collection]
# Display an example of documents
moscow_region.find_one()
# Count documents
# NOTE(review): Cursor.count() was removed in modern PyMongo; these calls
# assume the legacy driver used when this notebook was written.
moscow_region.find().count()
# Count nodes
moscow_region.find({'type':'node'}).count()
# Count ways
moscow_region.find({'type':'way'}).count()
# Count users
len(moscow_region.distinct('created.user'))
# Create list of users
user_list_mongo = moscow_region.distinct('created.user')
# Display some user names
print sorted(user_list_mongo)[:50]
# Count documents by user
moscow_region.find({"created.user": "Alexander Leschinsky"}).count()
# Display documents by user
for element in moscow_region.find({"created.user": "Alexander Leschinsky"}).sort("timestamp"):
    print element
# Create a list of 3 top users
top_users = moscow_region.aggregate([
    { "$group" : {"_id" : "$created.user", "count" : { "$sum" : 1} } },
    { "$sort" : {"count" : -1} }, { "$limit" : 3 }
] )
list(top_users)
# Count users with one post
onetime_users = moscow_region.aggregate( [
    { "$group" : {"_id" : "$created.user", "count" : { "$sum" : 1} } },
    { "$group" : {"_id" : "$count", "num_users": { "$sum" : 1} } },
    { "$sort" : {"_id" : 1} }, { "$limit" : 1}
] )
list(onetime_users)
# Create a list of 10 users with one post
list_onetime_users = moscow_region.aggregate([
    { "$group" : {"_id" : "$created.user", "count" : { "$sum" : 1} } },
    { "$sort" : {"count" : 1} }, { "$limit" : 10 }
] )
print list(list_onetime_users)
# Create a list of 3 most common places
places = moscow_region.aggregate( [
    { "$match" : { "address.place" : { "$exists" : 1} } },
    { "$group" : { "_id" : "$address.place", "count" : { "$sum" : 1} } },
    { "$sort" : { "count" : -1}}, {"$limit":3}
] )
list(places)
# Create a list of 10 most common types of buildings
buildings = moscow_region.aggregate([
    {'$match': {'building': { '$exists': 1}}},
    {'$group': {'_id': '$building','count': {'$sum': 1}}},
    {'$sort': {'count': -1}}, {'$limit': 10}
])
list(buildings)
# Create a list of 10 most common facilities
facilities = moscow_region.aggregate([
    {'$match': {'amenity': {'$exists': 1}}},
    {'$group': {'_id': '$amenity', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}}, {'$limit': 10}
])
list(facilities)
# Create a list of 3 most common zipcodes
top_zipcodes = moscow_region.aggregate( [
    { "$match" : { "address.postcode" : { "$exists" : 1} } },
    { "$group" : { "_id" : "$address.postcode", "count" : { "$sum" : 1} } },
    { "$sort" : { "count" : -1}}, {"$limit": 3}
] )
list(top_zipcodes)
# Count zipcodes with one document
onetime_zipcodes = moscow_region.aggregate( [
    { "$group" : {"_id" : "$address.postcode", "count" : { "$sum" : 1} } },
    { "$group" : {"_id" : "$count", "count": { "$sum" : 1} } },
    { "$sort" : {"_id" : 1} }, { "$limit" : 1}
] )
list(onetime_zipcodes)
# Share (percent) of one-off postcodes among all distinct postcodes.
print 12.00/1503*100, "%"
# Get DB statistics
db.command("dbstats")
# Get collection names
db.collection_names()
# Get collection statistics
db.command("collstats", "/Users/olgabelitskaya/large-repo/moscow_region")