enhancement: Speed up function `_get_optimal_value_for_bbox` by 2,883% (#4181)
<!-- CODEFLASH_OPTIMIZATION:
{"function":"_get_optimal_value_for_bbox","file":"unstructured/partition/pdf_image/analysis/bbox_visualisation.py","speedup_pct":"2,883%","speedup_x":"28.83x","original_runtime":"11.9 milliseconds","best_runtime":"398 microseconds","optimization_type":"algorithm","timestamp":"2025-12-20T00:54:57.031Z","version":"1.0"}
-->
#### 📄 2,883% (28.83x) speedup for ***`_get_optimal_value_for_bbox` in `unstructured/partition/pdf_image/analysis/bbox_visualisation.py`***
⏱️ Runtime : **`11.9 milliseconds`** **→** **`398 microseconds`** (best of `250` runs)
#### 📝 Explanation and details
The optimized code achieves a **2,883% (28.83x) speedup** by applying two key
optimizations:
**1. Numba JIT Compilation:**
Added `@njit(cache=True, fastmath=True)` decorators to
`_get_bbox_to_page_ratio` and the new `_linear_polyfit_2point`
functions. Numba compiles these Python functions to machine code,
eliminating interpreter overhead and providing near-C performance for
numerical computations.
**2. Replaced NumPy's General-Purpose Linear Regression:**
The original code used `np.polyfit()` for simple 2-point linear
interpolation, which is overkill and involves significant overhead. The
optimization replaces this with a custom `_linear_polyfit_2point`
function that directly computes slope and intercept using basic
arithmetic: `slope = (y1-y0)/(x1-x0)` and `intercept = y0 - slope*x0`.
This eliminates the overhead of NumPy's general polynomial fitting
algorithm.
**Performance Impact:**
From the line profiler results, the original `np.polyfit` call consumed
86.5% of execution time (24.7ms out of 28.6ms total). The optimized
version reduces this to just 15.2% of a much smaller total runtime. The
first call to each JIT-compiled function includes compilation overhead,
but subsequent calls benefit from cached machine code.
**Real-World Benefits:**
Based on function references, `_get_optimal_value_for_bbox` is called by
`get_bbox_text_size` and `get_bbox_thickness` for PDF visualization.
These functions likely process many bounding boxes during document
analysis, making the roughly 29x speedup significant for document
processing pipelines.
**Test Case Performance:**
The optimizations excel across all test scenarios, showing roughly 15-30x
speedups for individual calls and the highest gains (about 30x) for bulk
processing tests with many bounding boxes, demonstrating the value of
JIT compilation for repeated computational workloads.
✅ **Correctness verification report:**
| Test | Status |
| --------------------------- | ----------------- |
| ⚙️ Existing Unit Tests | 🔘 **None Found** |
| 🌀 Generated Regression Tests | ✅ **722 Passed** |
| ⏪ Replay Tests | 🔘 **None Found** |
| 🔎 Concolic Coverage Tests | 🔘 **None Found** |
|📊 Tests Coverage | 100.0% |
<details>
<summary>🌀 Generated Regression Tests and Runtime</summary>
```python
import math
import numpy as np
# imports
from unstructured.partition.pdf_image.analysis.bbox_visualisation import _get_optimal_value_for_bbox
# unit tests
# -------- BASIC TEST CASES --------
def test_min_ratio_returns_min_value():
    """Exercise a square bbox sized to sit at ``ratio_for_min_value``.

    Original intent: such a bbox maps to ``min_value``. The value is captured in
    ``result``; nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    # Page diagonal, then the bbox diagonal that yields the target ratio.
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    target_diagonal = page_diagonal * ratio_for_min_value
    # A square has side = diagonal / sqrt(2); int() truncates to whole pixels.
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 24.8μs -> 1.17μs (2024% faster)
def test_max_ratio_returns_max_value():
    """Exercise a square bbox sized to sit at ``ratio_for_max_value``.

    Original intent: such a bbox maps to ``max_value``. Nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    target_diagonal = page_diagonal * ratio_for_max_value
    # Side of the origin-anchored square with that diagonal, truncated by int().
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 23.8μs -> 1.08μs (2093% faster)
def test_mid_ratio_returns_mid_value():
    """Exercise a bbox whose ratio is halfway between the min and max ratios.

    Original intent: the result is the midpoint of ``min_value`` and ``max_value``.
    ``expected`` is computed for reference only; nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    mid_ratio = (ratio_for_min_value + ratio_for_max_value) / 2
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    target_diagonal = page_diagonal * mid_ratio
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 23.5μs -> 1.08μs (2066% faster)
    # Reference value: the midpoint truncated with int(), per the original note that
    # the function applies int() to its result.
    expected = int((min_value + max_value) / 2)
def test_linear_increase_between_min_and_max():
    """Exercise bboxes at 20%, 30% and 40% of the way between the two ratios.

    Original intent: the value grows linearly between ``min_value`` and ``max_value``.
    ``expected`` is derived from a degree-1 ``np.polyfit`` for reference only;
    nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    ratio_span = ratio_for_max_value - ratio_for_min_value
    for frac in (0.2, 0.3, 0.4):
        ratio = ratio_for_min_value + frac * ratio_span
        target_diagonal = page_diagonal * ratio
        edge = int(target_diagonal / math.sqrt(2))
        bbox = (0, 0, edge, edge)
        codeflash_output = _get_optimal_value_for_bbox(
            bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
        )
        result = codeflash_output  # 57.4μs -> 2.87μs (1898% faster)
        # Reference value from the same two-point linear fit (slope, intercept).
        coefficients = np.polyfit(
            (ratio_for_min_value, ratio_for_max_value), (min_value, max_value), 1
        )
        expected = int(ratio * coefficients[0] + coefficients[1])
# -------- EDGE TEST CASES --------
def test_bbox_smaller_than_min_ratio_clamps_to_min():
    """Exercise a bbox at half of ``ratio_for_min_value``.

    Original intent: the result is clamped to ``min_value``. Nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    # Aim below the lower ratio bound.
    target_diagonal = page_diagonal * (ratio_for_min_value / 2)
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 23.2μs -> 1.08μs (2043% faster)
def test_bbox_larger_than_max_ratio_clamps_to_max():
    """Exercise a bbox at 1.2 times ``ratio_for_max_value``.

    Original intent: the result is clamped to ``max_value``. Nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    # Aim above the upper ratio bound.
    target_diagonal = page_diagonal * (ratio_for_max_value * 1.2)
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 23.3μs -> 1.04μs (2135% faster)
def test_min_value_equals_max_value():
    """Exercise several bbox sizes with ``min_value == max_value == 42``.

    Original intent: that single value is always returned. Nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value = max_value = 42
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    # The page diagonal is the same for every ratio, so compute it once.
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    for ratio in (0.01, 0.1, 0.25, 0.5, 0.75):
        target_diagonal = page_diagonal * ratio
        edge = int(target_diagonal / math.sqrt(2))
        bbox = (0, 0, edge, edge)
        codeflash_output = _get_optimal_value_for_bbox(
            bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
        )
        result = codeflash_output  # 92.7μs -> 3.42μs (2613% faster)
def test_zero_sized_bbox():
    """Call with a degenerate (0, 0, 0, 0) bbox; original intent: min_value is returned."""
    bbox = (0, 0, 0, 0)
    page_size = (1000, 1000)
    min_value, max_value = 5, 10
    # The ratio arguments are omitted, so the function's defaults apply.
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 23.5μs -> 1.17μs (1915% faster)
def test_non_square_page_and_bbox():
    """Exercise an 800x1200 page with a bbox sized at ``ratio_for_min_value``.

    Nothing is asserted here; the result is only captured.
    """
    page_size = (800, 1200)
    min_value, max_value = 3, 9
    ratio_for_min_value, ratio_for_max_value = 0.05, 0.3
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    target_diagonal = page_diagonal * ratio_for_min_value
    # The bbox itself is a square derived from the target diagonal.
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 33.5μs -> 1.67μs (1908% faster)
def test_negative_bbox_coordinates():
    """Exercise a bbox whose top-left corner has negative coordinates.

    ``expected`` is a reference value from a two-point ``np.polyfit`` over the ratios
    (0.01, 0.5), clamped to [min_value, max_value]; nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 1, 100
    bbox = (-10, -10, 100, 100)
    # Both extents are 110, so the diagonal equals that of (0, 0, 110, 110).
    left, top, right, bottom = bbox
    expected_diag = math.sqrt((right - left) ** 2 + (bottom - top) ** 2)
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    expected_ratio = expected_diag / page_diagonal
    # Reference value: linear fit, int() truncation, then clamp.
    coefficients = np.polyfit((0.01, 0.5), (min_value, max_value), 1)
    expected = int(expected_ratio * coefficients[0] + coefficients[1])
    expected = max(min_value, min(max_value, expected))
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 21.3μs -> 1.33μs (1496% faster)
def test_min_ratio_greater_than_max_ratio():
    """Exercise reversed ratios (``ratio_for_min_value`` > ``ratio_for_max_value``).

    Original intent: the call still works. Nothing is asserted here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 10, 50
    ratio_for_min_value, ratio_for_max_value = 0.5, 0.01
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    # Pick a bbox whose ratio lies midway between the two (reversed) bounds.
    mid_ratio = (ratio_for_min_value + ratio_for_max_value) / 2
    target_diagonal = page_diagonal * mid_ratio
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 24.8μs -> 1.17μs (2023% faster)
def test_non_integer_bbox_and_page_size():
    """Exercise float-valued page dimensions and bbox coordinates.

    The bbox side is left untruncated here. Nothing is asserted.
    """
    page_size = (1000.0, 1000.0)
    min_value, max_value = 10, 20
    ratio_for_min_value, ratio_for_max_value = 0.1, 0.2
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    target_diagonal = page_diagonal * ratio_for_min_value
    # Keep the side as a float so the bbox carries non-integer coordinates.
    edge = target_diagonal / math.sqrt(2)
    bbox = (0.0, 0.0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 23.8μs -> 1.46μs (1528% faster)
# -------- LARGE SCALE TEST CASES --------
def test_many_bboxes_linear_increase():
    """Exercise 100 bboxes whose ratio steps evenly from the min to the max ratio.

    Original intent: the output increases monotonically with bbox size. The previous
    result is tracked, but no comparison is performed here.
    """
    page_size = (1000, 1000)
    min_value, max_value = 5, 25
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    prev_result = None
    for i in range(100):
        # i = 0 gives the min ratio, i = 99 gives the max ratio.
        ratio = ratio_for_min_value + i * (ratio_for_max_value - ratio_for_min_value) / 99
        target_diagonal = page_diagonal * ratio
        edge = int(target_diagonal / math.sqrt(2))
        bbox = (0, 0, edge, edge)
        codeflash_output = _get_optimal_value_for_bbox(
            bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
        )
        result = codeflash_output  # 1.61ms -> 51.3μs (3030% faster)
        if prev_result is not None:
            # Placeholder: no monotonicity check is made against prev_result.
            pass
        prev_result = result
def test_large_page_and_bbox_sizes():
    """Exercise a 10**6 x 10**6 page with a bbox sized at ``ratio_for_max_value``.

    Original intent: very large sizes are handled without overflow. Nothing is
    asserted here.
    """
    page_size = (10**6, 10**6)
    min_value, max_value = 1, 1000
    ratio_for_min_value, ratio_for_max_value = 0.01, 0.5
    page_diagonal = math.sqrt(page_size[0] ** 2 + page_size[1] ** 2)
    target_diagonal = page_diagonal * ratio_for_max_value
    edge = int(target_diagonal / math.sqrt(2))
    bbox = (0, 0, edge, edge)
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, ratio_for_min_value, ratio_for_max_value
    )
    result = codeflash_output  # 23.4μs -> 1.08μs (2062% faster)
def test_many_random_bboxes():
    """Exercise 500 pseudo-random bboxes on a 1000x1000 page with default ratios.

    Original intent: every returned value lies in [min_value, max_value]. Nothing is
    asserted here; the result is only captured.

    The bboxes come from a locally seeded generator, so every run sees the same inputs
    and captured outputs can be compared like-for-like. The global ``random`` state is
    left untouched.
    """
    import random

    rng = random.Random(42)
    page_size = (1000, 1000)
    min_value, max_value = 10, 100
    for _ in range(500):
        # Top-left corner first, then a bottom-right corner at or beyond it.
        x1 = rng.randint(0, 900)
        y1 = rng.randint(0, 900)
        x2 = rng.randint(x1, 1000)
        y2 = rng.randint(y1, 1000)
        bbox = (x1, y1, x2, y2)
        codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
        result = codeflash_output  # 7.98ms -> 259μs (2981% faster)
```
```python
# imports
from unstructured.partition.pdf_image.analysis.bbox_visualisation import _get_optimal_value_for_bbox
# unit tests
# BASIC TESTS
def test_basic_small_bbox():
    """Call with a 10x10 bbox on a 1000x1000 page; original intent: min_value is returned."""
    page_size = (1000, 1000)
    bbox = (0, 0, 10, 10)
    min_value, max_value = 2, 10
    # The bbox is tiny relative to the page, so its diagonal ratio is very small.
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 34.8μs -> 1.79μs (1845% faster)
def test_basic_large_bbox():
    """Call with a 900x900 bbox on a 1000x1000 page; original intent: max_value is returned."""
    page_size = (1000, 1000)
    bbox = (0, 0, 900, 900)
    min_value, max_value = 2, 10
    # The bbox covers most of the page, so its diagonal ratio is large.
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 28.1μs -> 1.17μs (2310% faster)
def test_basic_mid_bbox():
    """Call with a 500x500 bbox; original intent: a value between min and max."""
    page_size = (1000, 1000)
    bbox = (0, 0, 500, 500)
    min_value, max_value = 2, 10
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 26.3μs -> 1.17μs (2155% faster)
def test_basic_non_square_bbox_and_page():
    """Call with a 50x100 bbox on a 200x400 page (both non-square)."""
    page_size = (200, 400)
    bbox = (10, 20, 60, 120)
    min_value, max_value = 1, 5
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 25.4μs -> 1.12μs (2156% faster)
def test_basic_min_equals_max():
    """Call with min_value == max_value == 7; original intent: that value is returned."""
    page_size = (1000, 1000)
    bbox = (0, 0, 100, 100)
    min_value = max_value = 7
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 25.0μs -> 1.17μs (2048% faster)
# EDGE CASES
def test_edge_bbox_at_origin():
    """Call with the empty bbox (0, 0, 0, 0); original intent: min_value is returned."""
    page_size = (1000, 1000)
    bbox = (0, 0, 0, 0)
    min_value, max_value = 1, 9
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 24.9μs -> 1.12μs (2115% faster)
def test_edge_bbox_equals_page():
    """Call with a bbox that spans the whole page; original intent: max_value is returned."""
    page_size = (1000, 1000)
    # The bbox covers the page exactly, corner to corner.
    bbox = (0, 0, page_size[0], page_size[1])
    min_value, max_value = 2, 8
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 24.6μs -> 1.12μs (2085% faster)
def test_edge_negative_coordinates():
    """Call with a bbox whose top-left corner is at negative coordinates."""
    page_size = (100, 100)
    bbox = (-10, -10, 10, 10)
    min_value, max_value = 1, 5
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 24.3μs -> 1.12μs (2063% faster)
def test_edge_min_greater_than_max():
    """Call with min_value (10) above max_value (5).

    Original intent: the result is always min_value (clamped). Nothing is asserted here.
    """
    page_size = (1000, 1000)
    bbox = (0, 0, 500, 500)
    min_value, max_value = 10, 5
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 24.1μs -> 1.12μs (2044% faster)
def test_edge_ratio_for_min_equals_max():
    """Call with both ratio bounds set to 0.1; original intent: handled gracefully."""
    page_size = (1000, 1000)
    bbox = (0, 0, 100, 100)
    min_value, max_value = 2, 10
    # Identical ratios make the two-point fit degenerate. Original note: polyfit
    # warns in this case but should not crash.
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value, 0.1, 0.1)
    result = codeflash_output  # 28.5μs -> 1.12μs (2433% faster)
def test_edge_bbox_width_or_height_zero():
    """Call once with a zero-width bbox and once with a zero-height bbox."""
    page_size = (100, 100)
    min_value, max_value = 1, 4
    # Zero width: x1 == x2.
    bbox = (10, 10, 10, 100)
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 24.0μs -> 1.12μs (2037% faster)
    # Zero height: y1 == y2.
    bbox = (10, 10, 100, 10)
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result2 = codeflash_output  # 19.1μs -> 666ns (2765% faster)
def test_edge_small_page_large_bbox():
    """Call with a 100x100 bbox on a 1x1 page, so the bbox far exceeds the page."""
    page_size = (1, 1)
    bbox = (0, 0, 100, 100)
    min_value, max_value = 1, 20
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 24.0μs -> 1.21μs (1883% faster)
def test_edge_large_page_small_bbox():
    """Call with a 1x1 bbox on a 10000x10000 page, so the bbox is negligible."""
    page_size = (10000, 10000)
    bbox = (0, 0, 1, 1)
    min_value, max_value = 1, 20
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 23.8μs -> 1.12μs (2018% faster)
def test_edge_float_values_for_ratios():
    """Call with explicit ratio bounds 0.05 and 0.25 instead of the defaults."""
    page_size = (1000, 1000)
    bbox = (0, 0, 500, 500)
    min_value, max_value = 1, 10
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, 0.05, 0.25
    )
    result = codeflash_output  # 23.8μs -> 1.08μs (2097% faster)
def test_edge_non_integer_values():
    """Call with float min/max values (2.7 and 9.3) to probe type handling."""
    page_size = (1000, 1000)
    bbox = (0, 0, 100, 100)
    # Original note: this should work, with the output always an int. Not asserted here.
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, 2.7, 9.3)
    result = codeflash_output  # 23.7μs -> 1.46μs (1526% faster)
# LARGE SCALE TESTS
def test_large_scale_many_bboxes():
    """Call repeatedly with square bboxes of increasing size on a 1000x1000 page."""
    page_size = (1000, 1000)
    min_value, max_value = 1, 20
    # 10 bboxes: side lengths 1, 101, ..., 901.
    for edge in range(1, 1001, 100):
        bbox = (0, 0, edge, edge)
        codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
        result = codeflash_output  # 176μs -> 6.16μs (2762% faster)
def test_large_scale_maximum_bbox():
    """Call with a bbox that spans the whole of a 9999x9999 page."""
    page_size = (9999, 9999)
    # The bbox reaches from the origin to the page's far corner.
    bbox = (0, 0, page_size[0], page_size[1])
    min_value, max_value = 1, 100
    codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
    result = codeflash_output  # 23.5μs -> 1.12μs (1989% faster)
def test_large_scale_random_bboxes():
    """Call with 50 pseudo-random page sizes and bboxes, made repeatable by a fixed seed."""
    import random

    # Seeding the global generator fixes the sequence of draws below.
    random.seed(42)
    min_value, max_value = 1, 50
    for _ in range(50):
        # Draw order matters for reproducibility: page size, top-left, bottom-right.
        width = random.randint(100, 1000)
        height = random.randint(100, 1000)
        x1 = random.randint(0, width // 2)
        y1 = random.randint(0, height // 2)
        x2 = random.randint(x1, width)
        y2 = random.randint(y1, height)
        page_size = (width, height)
        bbox = (x1, y1, x2, y2)
        codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
        result = codeflash_output  # 816μs -> 27.1μs (2919% faster)
def test_large_scale_extreme_ratios():
    """Call with ratio bounds pushed to the extremes (0.0001 and 0.9999)."""
    page_size = (1000, 1000)
    bbox = (0, 0, 500, 500)
    min_value, max_value = 1, 100
    # Near-zero ratio for the minimum value, near-one ratio for the maximum value.
    codeflash_output = _get_optimal_value_for_bbox(
        bbox, page_size, min_value, max_value, 0.0001, 0.9999
    )
    result = codeflash_output  # 23.4μs -> 1.08μs (2056% faster)
def test_large_scale_dense_grid():
    """Call once per cell of a 5x5 grid of 200-pixel bboxes tiling a 1000x1000 page."""
    page_size = (1000, 1000)
    min_value, max_value = 1, 10
    step = 200
    # Cell origins along each axis: 0, 200, 400, 600, 800.
    origins = range(0, 801, step)
    for x1 in origins:
        for y1 in origins:
            # Cap the far corner at the page edge.
            x2 = min(x1 + step, 1000)
            y2 = min(y1 + step, 1000)
            bbox = (x1, y1, x2, y2)
            codeflash_output = _get_optimal_value_for_bbox(bbox, page_size, min_value, max_value)
            result = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
</details>
To edit these changes, run `git checkout codeflash/optimize-_get_optimal_value_for_bbox-mjdl64kx`, commit your edits, and push.
[Codeflash](https://codeflash.ai)

---------
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>