Open Source Approach: Python

Following the previous posting - Open Source Approach: R - we now look at basic GIS functions using Python packages. Although ESRI's arcpy gives us comprehensive GIS data models and methods without supplementing it with extra spatial packages, it still requires proprietary and expensive license to use them. This tutorial attempts to do the same Python spatial data analyses wihtout such proprietary technologies. Also, our focus will be on basic geospatial libraries in Python, especially GDAL/OGR and GEOS.

IMPORTANT! This tutorial is posted on my university webspace as iPython Notebook (http://home.uchicago.edu/~cmaene/OpenSourceApproach_Python.html) - with all outputs/graphics - please check the site for better viewing..

#!/user/bin/python
# -*- coding: utf-8 -*- python version 2.7.6

# load the libraries we will use in this tutorial..
import os, sys, ogr, osr, gdal
import matplotlib.pyplot as plt
# to make iPython notebook to produce plots inline
%matplotlib inline

Preparation - download and unzip data, set your working directory.

# set the working directory
os.chdir('/home/chieko/Documents/Python')

## download the tutorial data - skip this part if you've done this already
## need urllib if you download a data file (see the next block) in Python
#import urllib
#urllib.urlretrieve("http://home.uchicago.edu/~cmaene/data.zip", filename="data.zip")
## unzip the file - may not work with all OS - skip this part if you've done this already
#os.system('unzip data.zip')

os.chdir('data')
# os.listdir('.') # see what are in there
print "current working directory is:", os.getcwd()

Read spatial data files with GDAL's ogr.

# read spatial objects in CSV - require supplemental/descriptive meta file, VRT
# VRT acts like a supplemental metadata, describes another CSV properties
with open('GPS.vrt', 'r') as vrt:
    print "This is what GPS.vrt looks like:"
    print vrt.read()

datasource=ogr.GetDriverByName('VRT').Open('GPS.vrt')
# because of VRT description, our CSV points know their SRS!
print "Spatial Reference System (SRS) of the data source is\n", datasource.GetLayer().GetSpatialRef()
print "\nTranslate the Well-Known-Text SRS to Proj4! \n", datasource.GetLayer().GetSpatialRef().ExportToProj4()

# optional: conversion from VRT to shapefile - I think GDAL utility, ogr2ogr, is the best/easiest
# os.system('ogr2ogr -skipfailures -overwrite -f "ESRI Shapefile" GPS.shp GPS.vrt')
# plot the GPS points
plt.figure()
# loop thru shaperecords to get x/y from each feature (point/line/polygon)
x,y=[],[]
for feature in datasource.GetLayer():
    geomRef=feature.GetGeometryRef()
    x.append(geomRef.GetX()) # get X from each point and add to the list
    y.append(geomRef.GetY()) # get y from each point and add to the list
plt.plot(x,y, 'ro', label='GPS points') # 'ro' means red o-marker
plt.legend(numpoints=1)
plt.show()

Plot the GPS points with lines and a study area polygon.

plt.figure()

# add the GPS points
gps=ogr.GetDriverByName('ESRI Shapefile').Open('GPS.shp')
for feature in gps.GetLayer():
    geomRef=feature.GetGeometryRef()
    x=[geomRef.GetX(i) for i in range(geomRef.GetPointCount())] #although points has only one object..
    y=[geomRef.GetY(i) for i in range(geomRef.GetPointCount())] #although points has only one object..
    i1,=plt.plot(x,y, 'ro')
    
# read road lines and add the lines - learn the object
roads=ogr.GetDriverByName('ESRI Shapefile').Open('roads.shp')
print "Line : roads"
print "- Geometry type is", roads.GetLayer().GetGeomType()
# OGRwkbGeometryType {
# wkbUnknown = 0, wkbPoint = 1, wkbLineString = 2, wkbPolygon = 3,
# wkbMultiPoint = 4, wkbMultiLineString = 5, wkbMultiPolygon = 6, wkbGeometryCollection = 7,
# wkbCircularString = 8, wkbCompoundCurve = 9, wkbCurvePolygon = 10, wkbMultiCurve = 11,
# wkbMultiSurface = 12, wkbNone = 100, wkbLinearRing = 101, wkbCircularStringZ = 1008, ...
feature0=roads.GetLayer().GetFeature(0)
print "- The feature's geometry name is", feature0.GetGeometryRef().GetGeometryName()
print "- Structure of the 1st feature is", feature0.GetGeometryRef().ExportToWkt() 
print "- Number of points in the 1st feature is", feature0.GetGeometryRef().GetPointCount()
for feature in roads.GetLayer():
    geomRef=feature.GetGeometryRef()
    x=[geomRef.GetX(i) for i in range(geomRef.GetPointCount())]
    y=[geomRef.GetY(i) for i in range(geomRef.GetPointCount())]
    i2,=plt.plot(x,y, 'r--', color='blue')
    
# read studyarea polygon and add the polygon
area=ogr.GetDriverByName('ESRI Shapefile').Open('studyarea.shp')
print "\nPolygon : study area. Feature geometry is complex - compare to the line above"
print "- Geometry type is", area.GetLayer().GetGeomType()
feature0=area.GetLayer().GetFeature(0)
print "- The feature's geometry name is", feature0.GetGeometryRef().GetGeometryRef(0).GetGeometryName()
print "- Structure of the 1st feature is", feature0.GetGeometryRef().ExportToWkt() # notice that polygon points are within extra ()
print "- Number of points in the 1st feature are", feature0.GetGeometryRef().GetGeometryRef(0).GetPointCount(), "(the first and the last points are identical!)"
for feature in area.GetLayer(): 
    geomRef=feature.GetGeometryRef()
    x=[geomRef.GetGeometryRef(0).GetX(i) for i in range(geomRef.GetGeometryRef(0).GetPointCount())]
    y=[geomRef.GetGeometryRef(0).GetY(i) for i in range(geomRef.GetGeometryRef(0).GetPointCount())]
    plt.plot(x,y, '-', color='black')
    i3,=plt.fill(x, y, 'yellow')

# draw the plot
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# take the scientific notation from xy axis tick labels
ax = plt.gca()
ax.get_xaxis().get_major_formatter().set_useOffset(False)
ax.get_yaxis().get_major_formatter().set_useOffset(False)
# add legend items
legend=plt.legend([i1,i2,i3],['GPS points','Roads','Study area'])
legend.get_frame().set_facecolor('white')
plt.show()

Despite having somehow complex data models, ogr has many advantages. Select by attribute is one.. Reference: (1) by attribute filter: https://pcjericks.github.io/py-gdalogr-cookbook/vector_layers.html#filte... , (2) by SQL: http://www.gdal.org/ogr_sql.html, http://gdal.org/python/osgeo.ogr.DataSource-class.html#ExecuteSQL

# census tract mapping -
datasource=ogr.GetDriverByName('ESRI Shapefile').Open('tracts.shp')
# read SRS info
print "Spatial Reference System (SRS) of the data source is:\n", datasource.GetLayer().GetSpatialRef()
print "\nNumber of census tracts are", datasource.GetLayer().GetFeatureCount()

# start a new plot
plt.figure()

# read and plot the census tract shapefile
for feature in datasource.GetLayer():
    geomRef=feature.GetGeometryRef()
    x = [geomRef.GetGeometryRef(0).GetX(i) for i in range(geomRef.GetGeometryRef(0).GetPointCount())]
    y = [geomRef.GetGeometryRef(0).GetY(i) for i in range(geomRef.GetGeometryRef(0).GetPointCount())]
    plt.plot(x,y, '-', color='black')
    i1,=plt.fill(x,y,'white')

# one way to make a selection is to set an attribute filter
datasource.GetLayer().SetAttributeFilter("BLK>1000")
print "Number of Census tracts with BLK>1000 are", datasource.GetLayer().GetFeatureCount()
# clear previous attribute selection filter by passing None
datasource.GetLayer().SetAttributeFilter(None) 
    
# another way is to use SQL
selection=datasource.ExecuteSQL("select * from tracts where BLK>1000")
print "The SQL result is a new OGR layer:", type(selection)

# plot the selected census tracts
for feature in selection:
    geomRef=feature.GetGeometryRef()
    x = [geomRef.GetGeometryRef(0).GetX(i) for i in range(geomRef.GetGeometryRef(0).GetPointCount())]
    y = [geomRef.GetGeometryRef(0).GetY(i) for i in range(geomRef.GetGeometryRef(0).GetPointCount())]
    plt.plot(x,y, '-', color='black')
    i2,=plt.fill(x, y, 'blue')
plt.gca().set_aspect('equal', adjustable='box') # make axes intervals the same
plt.legend([i1,i2],['Tracts','AA>1000'], loc='lower left')
plt.show()

Here we try buffer operation - an OGR function built on GEOS library

# inputs are Chicago Train stations
sts=ogr.GetDriverByName('ESRI Shapefile').Open('CTAbrown.shp')
print "Number of features in the CTAbrown.shp is", sts.GetLayer().GetFeatureCount()

# Let's create a buffer around the stations, will also merge overlapping buffers
stsBufferMerge = ogr.Geometry(ogr.wkbPolygon)  # create an empty polygon
# for each feature/point create a buffer
for feature in sts.GetLayer():
    geomRef=feature.GetGeometryRef()
    stsBuffer=geomRef.Buffer(2640)
    # then merge the new buffer with the merge-buffer-polygon
    stsBufferMerge=stsBufferMerge.Union(stsBuffer)
print "Number of geometries in the buffer is", stsBufferMerge.GetGeometryCount()
print "Total area of the buffer is", format(stsBufferMerge.GetArea(),'.0f'), "square feet."

# plot
plt.figure()
x,y=[],[]
# correct XY of all station points
sts=ogr.GetDriverByName('ESRI Shapefile').Open('CTAbrown.shp')
for feature in sts.GetLayer():
    geomRef=feature.GetGeometryRef()
    x.append(geomRef.GetX()) # get X from each point and add to the list
    y.append(geomRef.GetY()) # get y from each point and add to the list
i1,=plt.plot(x,y, 'ro', color='brown')
# add the merged buffer - no loop, only one geometry
x = [stsBufferMerge.GetGeometryRef(0).GetX(i) for i in range(stsBufferMerge.GetGeometryRef(0).GetPointCount())]
y = [stsBufferMerge.GetGeometryRef(0).GetY(i) for i in range(stsBufferMerge.GetGeometryRef(0).GetPointCount())]
i2,=plt.plot(x,y, '-', color='saddlebrown')
plt.gca().set_aspect('equal', adjustable='box')
plt.legend([i1,i2],['Input','Buffer'], loc='lower left')
plt.show()

Reprojection - changing spatial reference system (SRS). Here, we reproject apartments to save in the same SRS as the buffer. I am using GDAL system utility, ogr2ogr - much easier to achieve than using OGR/OSR python package

# compare SRS of the two shapefiles
sts=ogr.GetDriverByName('ESRI Shapefile').Open('CTAbrown.shp')
print "SRS of CTA stations is:\n", sts.GetLayer().GetSpatialRef().ExportToProj4()
apt=ogr.GetDriverByName('ESRI Shapefile').Open('apartments.shp')
print "\nSRS of the original apartments is:\n", apt.GetLayer().GetSpatialRef().ExportToProj4()

# copy SRS from "CTA stations" in proj4, then apply it to reproject "apartments" which was originally in WGS84
# if we knew EPSG code, we could also use the code: ex. tosrs='EPSG:102671'
tosrs=sts.GetLayer().GetSpatialRef().ExportToProj4()
fmsrs=apt.GetLayer().GetSpatialRef().ExportToProj4()
args='ogr2ogr -overwrite -f "ESRI Shapefile" -t_srs "'+tosrs+'" -s_srs "'+fmsrs+'" apartmentsNEW.shp apartments.shp'
os.system(args)

# use the reproject apartments
apt=ogr.GetDriverByName('ESRI Shapefile').Open('apartmentsNEW.shp')
print "\nSRS of the reprojected apartments is:\n", apt.GetLayer().GetSpatialRef().ExportToProj4()

Another OGR advantage is the ability to select objects by location. We will use the previously merged buffer to select apartments

# plot all apartments
plt.figure()
x,y=[],[]
# correct XY of all station points
apt=ogr.GetDriverByName('ESRI Shapefile').Open('apartmentsNEW.shp')
for feature in apt.GetLayer():
    geomRef=feature.GetGeometryRef()
    x.append(geomRef.GetX()) # get X from each point and add to the list
    y.append(geomRef.GetY()) # get y from each point and add to the list
i1,=plt.plot(x,y, 'bo')

# make a selection by location/buffer zone using a spatial filter
apt.GetLayer().SetSpatialFilter(stsBufferMerge)
print "Total number of selected apartments are:", apt.GetLayer().GetFeatureCount()

# plot selected apartments
x,y=[],[]
# correct XY of all station points
for feature in apt.GetLayer():
    geomRef=feature.GetGeometryRef()
    x.append(geomRef.GetX()) # get X from each point and add to the list
    y.append(geomRef.GetY()) # get y from each point and add to the list
i2,=plt.plot(x,y, 'ro')

# add the merged buffer - no loop, only one geometry
x = [stsBufferMerge.GetGeometryRef(0).GetX(i) for i in range(stsBufferMerge.GetGeometryRef(0).GetPointCount())]
y = [stsBufferMerge.GetGeometryRef(0).GetY(i) for i in range(stsBufferMerge.GetGeometryRef(0).GetPointCount())]
i3,=plt.plot(x,y, '-', color='saddlebrown')

# put all together
plt.gca().set_aspect('equal', adjustable='box')
plt.legend([i1,i2,i3],['Input','Selected','Buffer'], bbox_to_anchor=(1,1), bbox_transform=plt.gcf().transFigure)
plt.show()

Let's calculate the distance to the nearest CTA stations from each apartment using Geopy's vincenty function!

# import geodesic distance calculation function
from geopy.distance import vincenty

# vincenty takes long/lat geographic coordinates only - reproject CTA brown stations
args='ogr2ogr -overwrite -f "ESRI Shapefile" -t_srs EPSG:4326 CTAbrownWGS84.shp CTAbrown.shp'
os.system(args)
sts=ogr.GetDriverByName('ESRI Shapefile').Open('CTAbrownWGS84.shp')
# collect all station IDs & xy coordinates in the two lists
stsId,xy=[],[]
for feature in sts.GetLayer():
    #print feature2.GetField("STATION_ID")
    stsId.append(feature.GetField("STATION_ID"))
    geomSts=feature.GetGeometryRef()
    x=geomSts.GetX()
    y=geomSts.GetY()
    xy.append((y,x))

# print the calculation result
print "\nAPTID PRICE NEAR_STATIONID DISTANCE_MILE"
# for each apartment, calculate distance and identify the nearest station
apt=ogr.GetDriverByName('ESRI Shapefile').Open('apartments.shp')
for feature1 in apt.GetLayer():
    geomApt=feature1.GetGeometryRef()
    x1=geomApt.GetX()
    y1=geomApt.GetY()
    # lambda function lets us iterate vincenty over xy list
    # map() collects and saves the result in the dist list
    dist=map(lambda i:vincenty((y1,x1),i).miles, xy) 
    print feature1.GetField("APTID"),feature1.GetField("PRICE"),stsId[dist.index(min(dist))], format(min(dist),'5.3f')

Optional: GDAL/OGR is not the only python package to read and write spatial objects. PyShp's shapefile library: object model is a lot simpler than OGR, though shapefiles I/O only

# make sure to install PyShp which contain shapefile library
import shapefile

# read GPS point shapefile
gps=shapefile.Reader('GPS.shp')
# PyShp's shapefile: get all records
records=gps.records()
# get the 3rd row/obs and 1-3 variable values
print records[2][0:3]

# plot as simple data objects with points and lines
plt.figure()
for shp in gps.shapeRecords():
    x = [i[0] for i in shp.shape.points[:]]
    y = [i[1] for i in shp.shape.points[:]]
    i1,=plt.plot(x,y, 'ro')
# read road lines and add the lines
roads=shapefile.Reader('roads.shp')
for shp in roads.shapeRecords():
    x = [i[0] for i in shp.shape.points[:]]
    y = [i[1] for i in shp.shape.points[:]]
    i2,=plt.plot(x,y, 'r--', color='blue')
# read studyarea polygon and add the polygon
area=shapefile.Reader('studyarea.shp') 
for shp in area.shapeRecords(): 
    x = [i[0] for i in shp.shape.points[:]] 
    y = [i[1] for i in shp.shape.points[:]]
    plt.plot(x,y, '-', color='green')
    i3,=plt.fill(x, y, 'yellow')
# draw the plot
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# take the scientific notation from xy axis tick labels
ax = plt.gca()
ax.get_xaxis().get_major_formatter().set_useOffset(False)
ax.get_yaxis().get_major_formatter().set_useOffset(False)
# add legend items
legend=plt.legend([i1,i2,i3],['GPS points','Roads','Study area'])
legend.get_frame().set_facecolor('white')
plt.show()

Finally, some raster example - reading and plotting suggestions..

# Read raster data using GDAL
import gdal
raster=gdal.Open('elevation.asc') # it's a simple ASCII text file.

# Learn about the raster data
print "A driver used to open the raster is %s" % raster.GetDriver().GetDescription()
print "There is %d band in the raster." % raster.RasterCount
print "Dimention of the raster is %d and %d" % (raster.RasterXSize, raster.RasterYSize)
# band=raster.GetRasterBand(1) # In GDAL, band numbers start at 1, rather than 0.
# print band.GetMetadata()

# what transformation info is available?
transform=raster.GetGeoTransform()
print "xOrigin     =", format(transform[0],'8.3f')    # upper-left X
print "yOrigin     =", format(transform[3],'8.3f')    # upper-left Y
print "pixelWidth  =", format(transform[1],'8.5f')
print "pixelHeight =", format(transform[5],'8.5f')
xmin,xmax,ymin,ymax=transform[0],transform[0]+transform[1]*raster.RasterXSize,transform[3]+transform[5]*raster.RasterYSize,transform[3]
print "xmin,xmax,ymin,ymax are:", xmin,xmax,ymin,ymax
rarray=raster.ReadAsArray()
print "The lowest elevation value is ", rarray.min()
print "The highest elevation value is ", rarray.max()

# plot image/raster like Matlab/Octave
plt.imshow(rarray,extent=[xmin,xmax,ymin,ymax])
plt.colorbar()

# add the GPS points, as a reference
gps=ogr.GetDriverByName('ESRI Shapefile').Open('GPS.shp')
x,y=[],[]
for feature in gps.GetLayer():
    geomRef=feature.GetGeometryRef()
    x.append(geomRef.GetX()) # get X from each point and add to the list
    y.append(geomRef.GetY()) # get y from each point and add to the list
plt.plot(x,y, 'ro', label='GPS points')
legend=plt.legend(numpoints=1)
legend.get_frame().set_facecolor('white')

# take the scientific notation from xy axis tick labels
ax = plt.gca()
ax.get_xaxis().get_major_formatter().set_useOffset(False)
ax.get_yaxis().get_major_formatter().set_useOffset(False)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

Thank you for reading!

Add new comment

Sign in with your CNETID and password to post a comment, or submit your comment using the form below.

Plain text

  • No HTML tags allowed.
  • Web page addresses and e-mail addresses turn into links automatically.
  • Lines and paragraphs break automatically.