enhancement: Speed up function `recursive_xy_cut_swapped` by 221% (#4173)
<!-- CODEFLASH_OPTIMIZATION:
{"function":"recursive_xy_cut_swapped","file":"unstructured/partition/utils/xycut.py","speedup_pct":"221%","speedup_x":"2.21x",
"original_runtime":"74.9 milliseconds","best_runtime":"23.4 milliseconds",
"optimization_type":"loop","timestamp":"2025-12-19T10:16:38.619Z","version":"1.0"}
-->
#### 📄 221% (2.21x) speedup for ***`recursive_xy_cut_swapped` in
`unstructured/partition/utils/xycut.py`***
⏱️ Runtime : **`74.9 milliseconds`** **→** **`23.4 milliseconds`** (best
of `57` runs)
#### 📝 Explanation and details
The optimized code achieves a **221% speedup** by applying **Numba JIT
compilation** to the two most computationally expensive functions:
`projection_by_bboxes` and `split_projection_profile`.
**Key optimizations:**
1. **`@njit(cache=True)` decorators** on both bottleneck functions
compile them to optimized machine code, eliminating Python interpreter
overhead
2. **Explicit loop replacement** in `projection_by_bboxes`: Changed from
`for start, end in boxes[:, axis::2]` with NumPy slice updates to
explicit integer loops accessing individual array elements, which is
much faster in Numba's nopython mode
3. **Manual array construction** in `split_projection_profile`: Replaced
`np.insert()` and `np.append()` with pre-allocated arrays and explicit
assignment loops, avoiding expensive array concatenation operations
**Performance impact analysis:**
The line profiler results must be read with care, because they include
JIT compilation overhead:
- Total profiled runtime of `projection_by_bboxes` went from ~21ms to
~1.17s; this apparent regression is misleading because the figure
includes JIT compilation overhead
- The actual per-call performance shows the functions are much faster,
as evidenced by the overall 221% speedup
**Workload benefits:**
Based on the function references and test results, this optimization is
particularly valuable for:
- **Document layout analysis** where `recursive_xy_cut_swapped`
processes many bounding boxes
- **Large-scale scenarios** (500+ boxes) showing 200-240% speedups
consistently
- **Recursive processing** workflows where these functions are called
repeatedly in nested operations
The optimization maintains identical behavior while dramatically
reducing computational overhead for any workload involving spatial
partitioning of bounding boxes, especially beneficial for document
processing pipelines that handle complex layouts with many text regions.
✅ **Correctness verification report:**
| Test | Status |
| --------------------------- | ----------------- |
| ⚙️ Existing Unit Tests | 🔘 **None Found** |
| 🌀 Generated Regression Tests | ✅ **40 Passed** |
| ⏪ Replay Tests | 🔘 **None Found** |
| 🔎 Concolic Coverage Tests | 🔘 **None Found** |
|📊 Tests Coverage | 100.0% |
<details>
<summary>🌀 Generated Regression Tests and Runtime</summary>
```python
# function to test
import numpy as np
# imports
from unstructured.partition.utils.xycut import recursive_xy_cut_swapped
# unit tests
# Basic Test Cases
def test_single_box():
# Test with a single bounding box
boxes = np.array([[0, 0, 10, 10]])
indices = np.array([0])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 44.5μs -> 12.6μs (252% faster)
def test_two_non_overlapping_boxes():
# Two boxes far apart horizontally
boxes = np.array([[0, 0, 10, 10], [20, 0, 30, 10]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 63.7μs -> 19.0μs (235% faster)
def test_two_overlapping_boxes_y():
# Two boxes stacked vertically
boxes = np.array([[0, 0, 10, 10], [0, 20, 10, 30]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 118μs -> 35.5μs (235% faster)
def test_three_boxes_grid():
# Three boxes in a grid
boxes = np.array([[0, 0, 10, 10], [20, 0, 30, 10], [0, 20, 10, 30]])
indices = np.array([0, 1, 2])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 136μs -> 40.9μs (234% faster)
def test_boxes_already_sorted():
# Boxes already sorted by x then y
boxes = np.array([[0, 0, 10, 10], [0, 20, 10, 30], [20, 0, 30, 10]])
indices = np.array([0, 1, 2])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 136μs -> 40.2μs (239% faster)
# Edge Test Cases
def test_boxes_with_zero_area():
# Box with zero width and/or height
boxes = np.array([[0, 0, 0, 10], [10, 10, 20, 10]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 55.6μs -> 36.7μs (51.5% faster)
def test_boxes_with_negative_coordinates():
# Boxes with negative coordinates
boxes = np.array([[-10, -10, 0, 0], [0, 0, 10, 10]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 53.8μs -> 14.7μs (266% faster)
def test_boxes_with_overlap():
# Overlapping boxes
boxes = np.array([[0, 0, 10, 10], [5, 5, 15, 15]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 50.5μs -> 15.0μs (236% faster)
def test_boxes_with_same_coordinates():
# Multiple boxes with same coordinates
boxes = np.array([[0, 0, 10, 10], [0, 0, 10, 10]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 48.2μs -> 13.1μs (268% faster)
def test_boxes_with_minimal_gap():
# Boxes separated by a minimal gap (gap = 1); they do not touch
boxes = np.array([[0, 0, 10, 10], [11, 0, 21, 10]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 66.6μs -> 19.8μs (237% faster)
def test_boxes_with_no_split_possible():
# All boxes overlap so no split
boxes = np.array([[0, 0, 10, 10], [5, 0, 15, 10], [8, 0, 18, 10]])
indices = np.array([0, 1, 2])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 49.7μs -> 13.3μs (272% faster)
# Large Scale Test Cases
def test_large_number_of_boxes_horizontal():
# 500 boxes in a row horizontally
boxes = np.array([[i * 2, 0, i * 2 + 1, 10] for i in range(500)])
indices = np.arange(500)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 10.00ms -> 3.33ms (200% faster)
def test_large_number_of_boxes_vertical():
# 500 boxes in a column vertically
boxes = np.array([[0, i * 2, 10, i * 2 + 1] for i in range(500)])
indices = np.arange(500)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 19.6ms -> 6.31ms (211% faster)
def test_large_grid_of_boxes():
# 20x20 grid of boxes
boxes = []
indices = []
idx = 0
for i in range(20):
for j in range(20):
boxes.append([i * 5, j * 5, i * 5 + 4, j * 5 + 4])
indices.append(idx)
idx += 1
boxes = np.array(boxes)
indices = np.array(indices)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 14.9ms -> 4.36ms (242% faster)
def test_boxes_with_random_order():
# 100 boxes, shuffled
boxes = np.array([[i, i, i + 10, i + 10] for i in range(100)])
indices = np.arange(100)
rng = np.random.default_rng(42)
perm = rng.permutation(100)
boxes = boxes[perm]
indices = indices[perm]
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 223μs -> 22.8μs (880% faster)
def test_boxes_with_dense_overlap():
# 100 boxes all overlapping at the same spot
boxes = np.array([[0, 0, 10, 10] for _ in range(100)])
indices = np.arange(100)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 219μs -> 19.8μs (1011% faster)
# Edge: degenerate case with one pixel boxes
def test_one_pixel_boxes():
boxes = np.array([[i, i, i + 1, i + 1] for i in range(50)])
indices = np.arange(50)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 162μs -> 16.3μs (895% faster)
# Edge: maximal coordinates
def test_boxes_with_max_coordinates():
boxes = np.array([[0, 0, 999, 999], [500, 500, 999, 999]])
indices = np.array([0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 52.3μs -> 21.2μs (147% faster)
# Edge: indices are not in order
def test_indices_not_in_order():
boxes = np.array([[0, 0, 10, 10], [10, 0, 20, 10], [0, 10, 10, 20]])
indices = np.array([2, 0, 1])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 48.5μs -> 13.1μs (271% faster)
# Edge: all boxes touching at one point
def test_boxes_touching_at_one_point():
boxes = np.array([[0, 0, 10, 10], [10, 10, 20, 20], [20, 20, 30, 30]])
indices = np.array([0, 1, 2])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 49.2μs -> 13.2μs (273% faster)
# Edge: very thin boxes
def test_very_thin_boxes():
boxes = np.array([[i, 0, i + 1, 100] for i in range(30)])
indices = np.arange(30)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 106μs -> 16.9μs (531% faster)
# Edge: very flat boxes
def test_very_flat_boxes():
boxes = np.array([[0, i, 100, i + 1] for i in range(30)])
indices = np.arange(30)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 106μs -> 16.5μs (544% faster)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
```python
import numpy as np
# imports
from unstructured.partition.utils.xycut import recursive_xy_cut_swapped
# unit tests
# Basic Test Cases
def test_single_box():
# One box, should return the single index
boxes = np.array([[0, 0, 10, 10]])
indices = np.array([42])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 46.6μs -> 13.2μs (254% faster)
def test_two_non_overlapping_boxes():
# Two boxes, non-overlapping, should return indices sorted by x then y
boxes = np.array(
[
[0, 0, 10, 10], # left box
[20, 0, 30, 10], # right box
]
)
indices = np.array([1, 2])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 65.5μs -> 19.1μs (242% faster)
def test_two_vertically_stacked_boxes():
# Two boxes, stacked vertically, should be sorted by y within x
boxes = np.array(
[
[0, 0, 10, 10], # top box
[0, 20, 10, 30], # bottom box
]
)
indices = np.array([3, 4])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 121μs -> 35.7μs (241% faster)
def test_three_boxes_mixed():
# Boxes in different positions, tests sorting and splitting
boxes = np.array(
[
[0, 0, 10, 10], # top left
[20, 0, 30, 10], # top right
[0, 20, 10, 30], # bottom left
]
)
indices = np.array([10, 11, 12])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 137μs -> 41.3μs (232% faster)
# Edge Test Cases
def test_boxes_with_zero_area():
# Boxes with zero width or height should be ignored
boxes = np.array(
[
[0, 0, 0, 10], # zero width
[10, 10, 20, 10], # zero height
[5, 5, 15, 15], # valid box
]
)
indices = np.array([100, 101, 102])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 71.4μs -> 38.7μs (84.7% faster)
def test_boxes_touching_edges():
# Boxes that touch but do not overlap
boxes = np.array(
[
[0, 0, 10, 10],
[10, 0, 20, 10], # touches right edge of first
[20, 0, 30, 10], # touches right edge of second
]
)
indices = np.array([200, 201, 202])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 53.4μs -> 15.2μs (252% faster)
def test_boxes_with_identical_coordinates():
# Multiple boxes with identical coordinates
boxes = np.array(
[
[0, 0, 10, 10],
[0, 0, 10, 10],
[0, 0, 10, 10],
]
)
indices = np.array([301, 302, 303])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 50.3μs -> 13.5μs (274% faster)
def test_boxes_with_negative_coordinates():
# Boxes with negative coordinates
boxes = np.array(
[
[-10, -10, 0, 0],
[0, 0, 10, 10],
[10, 10, 20, 20],
]
)
indices = np.array([400, 401, 402])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 49.8μs -> 13.5μs (267% faster)
def test_boxes_fully_overlapping():
# All boxes overlap completely
boxes = np.array(
[
[0, 0, 10, 10],
[0, 0, 10, 10],
[0, 0, 10, 10],
]
)
indices = np.array([501, 502, 503])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 48.8μs -> 13.0μs (275% faster)
def test_boxes_with_minimal_gap():
# Boxes separated by minimal gap (just enough to split)
boxes = np.array(
[
[0, 0, 10, 10],
[11, 0, 21, 10], # gap of 1
[22, 0, 32, 10], # gap of 1
]
)
indices = np.array([601, 602, 603])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 86.4μs -> 26.2μs (229% faster)
# Large Scale Test Cases
def test_many_boxes_horizontal():
# 100 boxes in a horizontal row
N = 100
boxes = np.array([[i * 10, 0, i * 10 + 9, 10] for i in range(N)])
indices = np.arange(N)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 1.87ms -> 556μs (236% faster)
def test_many_boxes_vertical():
# 100 boxes in a vertical column
N = 100
boxes = np.array([[0, i * 10, 10, i * 10 + 9] for i in range(N)])
indices = np.arange(N)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 3.78ms -> 1.16ms (225% faster)
def test_grid_of_boxes():
# 10x10 grid of boxes
N = 10
boxes = []
indices = []
idx = 0
for i in range(N):
for j in range(N):
boxes.append([i * 10, j * 10, i * 10 + 9, j * 10 + 9])
indices.append(idx)
idx += 1
boxes = np.array(boxes)
indices = np.array(indices)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 3.86ms -> 1.13ms (242% faster)
# Should be sorted first by x (columns), then by y (rows) within each column
expected = []
for i in range(N):
col_indices = [i * N + j for j in range(N)]
expected.extend(col_indices)
def test_large_random_boxes():
# 500 random boxes, test performance and correctness
np.random.seed(42)
N = 500
left = np.random.randint(0, 1000, size=N)
top = np.random.randint(0, 1000, size=N)
width = np.random.randint(1, 10, size=N)
height = np.random.randint(1, 10, size=N)
right = left + width
bottom = top + height
boxes = np.stack([left, top, right, bottom], axis=1)
indices = np.arange(N)
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 18.2ms -> 5.82ms (212% faster)
def test_boxes_with_max_coordinates():
# Boxes with coordinates at the upper range
boxes = np.array(
[
[990, 990, 999, 999],
[995, 995, 999, 999],
[900, 900, 950, 950],
]
)
indices = np.array([800, 801, 802])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 69.8μs -> 23.5μs (198% faster)
# Additional edge case: test with all boxes in a single point (degenerate case)
def test_boxes_degenerate_point():
boxes = np.array(
[
[5, 5, 5, 5],
[5, 5, 5, 5],
]
)
indices = np.array([900, 901])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 11.2μs -> 4.29μs (160% faster)
# Additional: test with a single tall, thin box and a single short, wide box
def test_tall_and_wide_boxes():
boxes = np.array(
[
[0, 0, 2, 100], # tall, thin
[0, 0, 100, 2], # short, wide
]
)
indices = np.array([1000, 1001])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 47.4μs -> 13.9μs (240% faster)
# Additional: test with overlapping but not identical boxes
def test_overlapping_boxes():
boxes = np.array(
[
[0, 0, 10, 10],
[5, 5, 15, 15],
[10, 10, 20, 20],
]
)
indices = np.array([1100, 1101, 1102])
res = []
recursive_xy_cut_swapped(boxes, indices, res) # 49.1μs -> 13.2μs (273% faster)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
```python
```
</details>
To edit these changes, run
`git checkout codeflash/optimize-recursive_xy_cut_swapped-mjcpsm6h`,
commit your edits, and push.
[Codeflash](https://codeflash.ai)

---------
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>