Skip to content

Commit fbf447f

Browse files
piyushka-ally and Piyush Kanti Chanda authored
[GH-2230] Implement GeoSeries.clip_by_rect (#2784)
Co-authored-by: Piyush Kanti Chanda <piyush.chanda@databricks.com>
1 parent fadc953 commit fbf447f

File tree

5 files changed

+131
-0
lines changed

5 files changed

+131
-0
lines changed

python/sedona/spark/geopandas/base.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3073,6 +3073,74 @@ def dwithin(self, other, distance, align=None):
30733073
"""
30743074
return _delegate_to_geometry_column("dwithin", self, other, distance, align)
30753075

3076+
def clip_by_rect(self, xmin, ymin, xmax, ymax):
    """Clip each geometry to an axis-aligned rectangle.

    Returns a ``GeoSeries`` holding, for every input geometry, the portion
    that lies within the rectangle described by the given coordinates.
    A geometry that does not intersect the rectangle is returned as an
    empty polygon (``POLYGON EMPTY``).

    .. note::
        Clipping is implemented with ``ST_Intersection`` against a
        rectangle envelope, so the output may differ slightly from
        geopandas' ``clip_by_rect`` in edge cases:

        - Non-intersecting geometries come back as ``POLYGON EMPTY``,
          whereas geopandas returns ``GEOMETRYCOLLECTION EMPTY``.
        - Points lying on the rectangle boundary count as intersecting
          and are returned unchanged, whereas geopandas returns
          ``GEOMETRYCOLLECTION EMPTY`` for boundary-only intersections.

    Parameters
    ----------
    xmin : float
        Minimum x value of the rectangle.
    ymin : float
        Minimum y value of the rectangle.
    xmax : float
        Maximum x value of the rectangle.
    ymax : float
        Maximum y value of the rectangle.

    Returns
    -------
    GeoSeries

    Examples
    --------
    >>> from sedona.spark.geopandas import GeoSeries
    >>> from shapely.geometry import Polygon, LineString, Point
    >>> s = GeoSeries(
    ...     [
    ...         Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
    ...         LineString([(0, 0), (2, 2)]),
    ...         Point(0.5, 0.5),
    ...     ],
    ... )

    >>> s.clip_by_rect(0, 0, 1, 1)
    0    POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))
    1    LINESTRING (0 0, 1 1)
    2    POINT (0.5 0.5)
    dtype: geometry

    Geometries that do not intersect the rectangle are returned as
    empty:

    >>> GeoSeries([Point(5, 5)]).clip_by_rect(0, 0, 1, 1)
    0    POLYGON EMPTY
    dtype: geometry

    See also
    --------
    GeoSeries.intersection
    """
    # Forward the rectangle bounds, in order, to the geometry column's
    # own ``clip_by_rect`` implementation.
    bounds = (xmin, ymin, xmax, ymax)
    return _delegate_to_geometry_column("clip_by_rect", self, *bounds)
3143+
30763144
def difference(self, other, align=None):
30773145
"""Returns a ``GeoSeries`` of the points in each aligned geometry that
30783146
are not in `other`.

python/sedona/spark/geopandas/geodataframe.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
"_to_geopandas",
5252
"contains",
5353
"contains_properly",
54+
"clip_by_rect",
5455
"convex_hull",
5556
"count_coordinates",
5657
"count_geometries",

python/sedona/spark/geopandas/geoseries.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
"convex_hull",
6868
"explode",
6969
"clip",
70+
"clip_by_rect",
7071
"from_shapely",
7172
"count_coordinates",
7273
"count_geometries",
@@ -835,6 +836,23 @@ def dwithin(self, other, distance, align=None):
835836
default_val=False,
836837
)
837838

839+
def clip_by_rect(self, xmin, ymin, xmax, ymax) -> "GeoSeries":
    """Clip the geometries to the rectangle ``(xmin, ymin, xmax, ymax)``.

    The four bounds are validated, converted to ``float``, turned into a
    rectangle with ``ST_PolygonFromEnvelope`` and intersected with this
    series' geometry column via ``ST_Intersection``.

    Raises
    ------
    TypeError
        If any bound is not an ``int``, ``float``, ``np.integer`` or
        ``np.floating`` instance.
    """
    numeric_types = (int, float, np.integer, np.floating)
    bounds = (xmin, ymin, xmax, ymax)

    # Validate before touching Spark so bad input fails fast.
    if any(not isinstance(bound, numeric_types) for bound in bounds):
        raise TypeError(
            "clip_by_rect only accepts scalar numeric values for xmin/ymin/xmax/ymax"
        )

    envelope = stc.ST_PolygonFromEnvelope(*(float(bound) for bound in bounds))
    clipped = stf.ST_Intersection(self.spark.column, envelope)
    return self._query_geometry_column(clipped, returns_geom=True)
855+
838856
def difference(self, other, align=None) -> "GeoSeries":
839857
other_series, extended = self._make_series_of_val(other)
840858
align = False if extended else align

python/tests/geopandas/test_geoseries.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,36 @@ def test_to_arrow(self):
616616
def test_clip(self):
617617
pass
618618

619+
def test_clip_by_rect(self):
    """Check clip_by_rect on a GeoSeries, a GeoDataFrame and invalid input."""
    bounds = (0, 0, 1, 1)
    series = GeoSeries(
        [
            Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
            LineString([(0, 0), (2, 2)]),
            Point(0.5, 0.5),
            Point(5, 5),
            None,
        ],
    )
    clipped = series.clip_by_rect(*bounds)
    expected = gpd.GeoSeries(
        [
            Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)]),
            LineString([(0, 0), (1, 1)]),
            Point(0.5, 0.5),
            Polygon(),  # Sedona returns POLYGON EMPTY for non-intersecting
            None,
        ]
    )
    self.check_sgpd_equals_gpd(clipped, expected)

    # The GeoDataFrame entry point must produce the same geometries.
    frame_clipped = series.to_geoframe().clip_by_rect(*bounds)
    self.check_sgpd_equals_gpd(frame_clipped, expected)

    # A non-numeric bound must be rejected with TypeError.
    with pytest.raises(TypeError):
        series.clip_by_rect("a", 0, 1, 1)
648+
619649
def test_geom_type(self):
620650
geoseries = sgpd.GeoSeries(
621651
[

python/tests/geopandas/test_match_geopandas_series.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,20 @@ def test_to_arrow(self):
495495
def test_clip(self):
496496
pass
497497

498+
def test_clip_by_rect(self):
    """Compare Sedona's clip_by_rect with geopandas' on every valid fixture."""
    # The rectangle (0.3, 0.3, 1.7, 1.7) is chosen so that no test-geometry
    # vertex or hole coordinate (0, 0.1, 0.2, 1, 2, …) lands on a rectangle
    # boundary, which avoids boundary-handling differences between JTS and
    # GEOS.
    rect = (0.3, 0.3, 1.7, 1.7)
    for geom in self.geoms:
        gpd_series = gpd.GeoSeries(geom)
        # Only valid geometries are compared: JTS throws TopologyException
        # on invalid geometries (e.g. self-intersecting polygons) during
        # ST_Intersection, while GEOS handles them gracefully.
        if gpd_series.is_valid.all():
            sgpd_result = GeoSeries(geom).clip_by_rect(*rect)
            gpd_result = gpd_series.clip_by_rect(*rect)
            self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
511+
498512
def test_geom_type(self):
499513
for geom in self.geoms:
500514
# Sedona converts it to LineString, so the outputs will be different

0 commit comments

Comments
 (0)