Geographic Information Systems Asked by RedM on December 15, 2020
I’ve got 2 geodataframes:
import geopandas as gpd
from shapely.geometry import Point
gpd1 = gpd.GeoDataFrame([['John',1,Point(1,1)],['Smith',1,Point(2,2)],['Soap',1,Point(0,2)]],columns=['Name','ID','geometry'])
gpd2 = gpd.GeoDataFrame([['Work',Point(0,1.1)],['Shops',Point(2.5,2)],['Home',Point(1,1.1)]],columns=['Place','geometry'])
and I want to find the name of the nearest point in gpd2 for each row in gpd1:
desired_output =
Name ID geometry Nearest
0 John 1 POINT (1 1) Home
1 Smith 1 POINT (2 2) Shops
2 Soap 1 POINT (0 2) Work
I’ve been trying to get this working using a lambda function:
gpd1['Nearest'] = gpd1.apply(lambda row: min_dist(row.geometry,gpd2)['Place'] , axis=1)
with
def min_dist(point, gpd2):
geoseries = some_function()
return geoseries
You can directly use the Shapely function Nearest points (the geometries of the GeoSeries are Shapely geometries):
from shapely.ops import nearest_points
# unary union of the gpd2 geomtries
pts3 = gpd2.geometry.unary_union
def near(point, pts=pts3):
# find the nearest point and return the corresponding Place value
nearest = gpd2.geometry == nearest_points(point, pts)[1]
return gpd2[nearest].Place.get_values()[0]
gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)
gpd1
Name ID geometry Nearest
0 John 1 POINT (1 1) Home
1 Smith 1 POINT (2 2) Shops
2 Soap 1 POINT (0 2) Work
Explication
for i, row in gpd1.iterrows():
print nearest_points(row.geometry, pts3)[0], nearest_points(row.geometry, pts3)[1]
POINT (1 1) POINT (1 1.1)
POINT (2 2) POINT (2.5 2)
POINT (0 2) POINT (0 1.1)
Correct answer by gene on December 15, 2020
Figured it out:
def min_dist(point, gpd2):
gpd2['Dist'] = gpd2.apply(lambda row: point.distance(row.geometry),axis=1)
geoseries = gpd2.iloc[gpd2['Dist'].argmin()]
return geoseries
Of course some criticism is welcome. I'm not a fan of recalculating gpd2['Dist'] for every row of gpd1...
Answered by RedM on December 15, 2020
If you have large dataframes, I've found that scipy
's cKDTree spatial index .query
method returns very fast results for nearest neighbor searches. As it uses a spatial index it's orders of magnitude faster than looping though the dataframe and then finding the minimum of all distances. It is also faster than using shapely's nearest_points
with RTree (the spatial index method available via geopandas) because cKDTree allows you to vectorize your search whereas the other method does not.
Here is a helper function that will return the distance and 'Name' of the nearest neighbor in gpd2
from each point in gpd1
. It assumes both gdfs have a geometry
column (of points).
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from shapely.geometry import Point
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)], ['Smith', 1, Point(2, 2)],
['Soap', 1, Point(0, 2)]],
columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)], ['Shops', Point(2.5, 2)],
['Home', Point(1, 1.1)]],
columns=['Place', 'geometry'])
def ckdnearest(gdA, gdB):
nA = np.array(list(gdA.geometry.apply(lambda x: (x.x, x.y))))
nB = np.array(list(gdB.geometry.apply(lambda x: (x.x, x.y))))
btree = cKDTree(nB)
dist, idx = btree.query(nA, k=1)
gdf = pd.concat(
[gdA.reset_index(drop=True), gdB.loc[idx, gdB.columns != 'geometry'].reset_index(drop=True),
pd.Series(dist, name='dist')], axis=1)
return gdf
ckdnearest(gpd1, gpd2)
And if you want to find the closest point to a LineString, here is a full working example:
import itertools
from operator import itemgetter
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
['Smith', 1, Point(2, 2)],
['Soap', 1, Point(0, 2)]],
columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
['Home', LineString([Point(101, 0), Point(102, 1)])]],
columns=['Place', 'geometry'])
def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
A = np.concatenate(
[np.array(geom.coords) for geom in gdfA.geometry.to_list()])
B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
B_ix = tuple(itertools.chain.from_iterable(
[itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
B = np.concatenate(B)
ckd_tree = cKDTree(B)
dist, idx = ckd_tree.query(A, k=1)
idx = itemgetter(*idx)(B_ix)
gdf = pd.concat(
[gdfA, gdfB.loc[idx, gdfB_cols].reset_index(drop=True),
pd.Series(dist, name='dist')], axis=1)
return gdf
c = ckdnearest(gpd1, gpd2)
Answered by JHuw on December 15, 2020
The answer by Gene didn't work for me. Finally I discovered that gpd2.geometry.unary_union resulted in a geometry that only contained about 30.000 of my total of roughly 150.000 points. For anyone else running into the same problem, here's how I solved it:
from shapely.ops import nearest_points
from shapely.geometry import MultiPoint
gpd2_pts_list = gpd2.geometry.tolist()
gpd2_pts = MultiPoint(gpd2_pts_list)
def nearest(point, gpd2_pts, gpd2=gpd2, geom_col='geometry', src_col='Place'):
# find the nearest point
nearest_point = nearest_points(point, gpd2_pts)[1]
# return the corresponding value of the src_col of the nearest point
value = gpd2[gpd2[geom_col] == nearest_point][src_col].get_values()[0]
return value
gpd1['Nearest'] = gpd1.apply(lambda x: nearest(x.geometry, gpd2_pts), axis=1)
Answered by Inske on December 15, 2020
For anyone having indexing errors with their own data while using the excellent answer from @JHuw, my problem was that my indexes did not align. Resetting the index of gdfA and gdfB solved my issues, maybe this can help you as well @Shakedk.
import itertools
from operator import itemgetter
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
['Smith', 1, Point(2, 2)],
['Soap', 1, Point(0, 2)]],
columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
['Home', LineString([Point(101, 0), Point(102, 1)])]],
columns=['Place', 'geometry'])
def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
# resetting the index of gdfA and gdfB here.
gdfA = gdfA.reset_index(drop=True)
gdfB = gdfB.reset_index(drop=True)
A = np.concatenate(
[np.array(geom.coords) for geom in gdfA.geometry.to_list()])
B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
B_ix = tuple(itertools.chain.from_iterable(
[itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
B = np.concatenate(B)
ckd_tree = cKDTree(B)
dist, idx = ckd_tree.query(A, k=1)
idx = itemgetter(*idx)(B_ix)
gdf = pd.concat(
[gdfA, gdfB.loc[idx, gdfB_cols].reset_index(drop=True),
pd.Series(dist, name='dist')], axis=1)
return gdf
c = ckdnearest(gpd1, gpd2)
Answered by Markus Rosenfelder on December 15, 2020
This solution is extremely inefficient but it should work for any and all geometry types (including mixed geometry type gdfs). I would only try this if your gdfs are small (my use case was a gdf with about 2000 rows for which I wanted to find the nearest feature from another gdf with about 15 rows, and it took a few seconds on a typical office laptop). Intersecting features may cause it to spaz out though, so be warned. It was originally based on @RedM's solution but will instead assign the index of the feature in gdf2
that is nearest to gdf1
gdf1["gdf2_idx"] = gdf1.apply(
lambda row1: gdf2.apply(
lambda row2: row1.geometry.distance(row2.geometry), axis="columns").idxmin(),
axis="columns"
)
Answered by wfgeo on December 15, 2020
Get help from others!
Recent Answers
Recent Questions
© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP