enhancement: Speed up method `_PartitionerLoader._load_partitioner` by 266% (#4162)
<!-- CODEFLASH_OPTIMIZATION:
{"function":"_PartitionerLoader._load_partitioner","file":"unstructured/partition/auto.py","speedup_pct":"266%","speedup_x":"2.66x","original_runtime":"2.33 milliseconds","best_runtime":"635 microseconds","optimization_type":"memory","timestamp":"2025-12-20T13:16:17.303Z","version":"1.0"}
-->
#### 📄 266% (2.66x) speedup for ***`_PartitionerLoader._load_partitioner` in `unstructured/partition/auto.py`***

⏱️ Runtime : **`2.33 milliseconds`** **→** **`635 microseconds`** (best of `250` runs)
#### 📝 Explanation and details
The optimization adds `@lru_cache(maxsize=128)` to the
`dependency_exists` function, providing **266% speedup** by eliminating
redundant dependency checks.
**Key optimization:** The original code repeatedly calls
`importlib.import_module()` for the same dependency packages during
partition loading. Looking at the line profiler results,
`dependency_exists` was called 659 times and spent 97.9% of its time
(9.33ms out of 9.53ms) in `importlib.import_module()`. The optimized
version reduces this to just 1.27ms total time for dependency checks.
**Why this works:** Every `importlib.import_module()` call has to go
through the import machinery: at minimum a `sys.modules` lookup for a
module that is already loaded, and a fresh search of the import path for
a module that is not installed. With caching, subsequent calls for the
same dependency name return immediately from memory instead of invoking
the import machinery again. The cache size of 128 is sufficient for
typical use cases where the same few dependencies are checked repeatedly.
**Performance impact by test case:**
- **Massive gains** when the same dependency is checked repeatedly: the
test that lists one dependency 500 times shows **7166% speedup**
(1.73ms → 23.9μs)
- **Modest slowdowns** when every dependency is checked only once
(including the test with 100 distinct dependencies): 0-25% slower due to
caching overhead
- **Best suited for:** Applications that load multiple partitioners or
repeatedly validate the same dependencies
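**Trade-offs:** Small memory overhead for the cache and slight
performance penalty for first-time dependency checks, but these are
negligible compared to the gains in repeated usage scenarios. Note that
results stay cached, negative ones included: a dependency installed
after a failed check may not be detected until its entry is evicted,
`dependency_exists.cache_clear()` is called, or the process restarts.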
✅ **Correctness verification report:**
| Test | Status |
| --------------------------- | ----------------- |
| ⚙️ Existing Unit Tests | 🔘 **None Found** |
| 🌀 Generated Regression Tests | ✅ **195 Passed** |
| ⏪ Replay Tests | 🔘 **None Found** |
| 🔎 Concolic Coverage Tests | 🔘 **None Found** |
|📊 Tests Coverage | 100.0% |
<details>
<summary>🌀 Generated Regression Tests and Runtime</summary>
```python
from __future__ import annotations
import importlib
import sys
import types
from typing import Callable
# imports
import pytest
from typing_extensions import TypeAlias
from unstructured.partition.auto import _PartitionerLoader
Partitioner: TypeAlias = Callable[..., list]
class DummyElement:
    """Placeholder element type returned by the fake partitioners in this module."""
# Dummy FileType class for testing
class FileType:
    """Minimal stand-in for a file-type descriptor, used only by these tests.

    It stores exactly the five values passed to ``__init__`` as same-named
    attributes; the tests hand instances to
    ``_PartitionerLoader._load_partitioner``.
    """

    def __init__(
        self,
        importable_package_dependencies,
        partitioner_function_name,
        partitioner_module_qname,
        extra_name,
        is_partitionable=True,
    ):
        self.importable_package_dependencies = importable_package_dependencies
        self.partitioner_function_name = partitioner_function_name
        self.partitioner_module_qname = partitioner_module_qname
        self.extra_name = extra_name
        self.is_partitionable = is_partitionable
# --- Helper functions for test setup ---
def create_fake_module(module_name, func_name, func):
    """Dynamically creates a module and injects it into sys.modules.

    Args:
        module_name: Key under which the module is registered in ``sys.modules``.
        func_name: Attribute name to set on the new module.
        func: Object stored under ``func_name``.

    Returns:
        The newly created module. Any existing ``sys.modules`` entry with the
        same name is overwritten, and the entry is not removed automatically.
    """
    mod = types.ModuleType(module_name)
    setattr(mod, func_name, func)
    sys.modules[module_name] = mod
    return mod
def fake_partitioner(*args, **kwargs):
    """Stand-in partitioner: ignore all arguments and return one ``DummyElement``."""
    return [DummyElement()]
# --- Basic Test Cases ---
def test_load_partitioner_basic_success():
    """Test loading a partitioner when all dependencies are present and everything is correct."""
    module_name = "test_partitioner_module.basic"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],  # No dependencies
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        part_func = codeflash_output  # 6.38μs -> 6.08μs (4.80% faster)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
    # The loader is expected to hand back the function registered on the module.
    assert part_func is fake_partitioner
def test_load_partitioner_with_single_dependency(monkeypatch):
    """Test loading a partitioner with a single dependency that exists."""
    module_name = "test_partitioner_module.singledep"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)
    # Simulate dependency_exists returns True: "somepkg" resolves to a dummy
    # object, every other name resolves to the fake partitioner module.
    monkeypatch.setattr(
        "importlib.import_module",
        lambda name: types.SimpleNamespace() if name == "somepkg" else sys.modules[module_name],
    )
    file_type = FileType(
        importable_package_dependencies=["somepkg"],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        part_func = codeflash_output  # 1.21μs -> 1.62μs (25.7% slower)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
    assert part_func is fake_partitioner
def test_load_partitioner_with_multiple_dependencies(monkeypatch):
    """Test loading a partitioner with multiple dependencies that all exist."""
    module_name = "test_partitioner_module.multidep"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)

    # Simulate import_module returns dummy for all dependencies
    def import_module_side_effect(name):
        if name in ("pkgA", "pkgB"):
            return types.SimpleNamespace()
        return sys.modules[module_name]

    monkeypatch.setattr("importlib.import_module", import_module_side_effect)
    file_type = FileType(
        importable_package_dependencies=["pkgA", "pkgB"],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        part_func = codeflash_output  # 1.42μs -> 1.67μs (14.9% slower)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
    assert part_func is fake_partitioner
def test_load_partitioner_returns_correct_function():
    """Test that the returned function is the actual partitioner function from the module."""
    module_name = "test_partitioner_module.correct_func"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        part_func = codeflash_output  # 7.29μs -> 7.25μs (0.579% faster)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
    # This identity check is the whole point of the test per its docstring.
    assert part_func is fake_partitioner
# --- Edge Test Cases ---
def test_load_partitioner_missing_dependency(monkeypatch):
    """Test that ImportError is raised when a dependency is missing."""
    module_name = "test_partitioner_module.missingdep"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)
    # Simulate dependency_exists returns False for missingpkg
    original_import_module = importlib.import_module

    def import_module_side_effect(name):
        if name == "missingpkg":
            raise ImportError("No module named 'missingpkg'")
        return original_import_module(name)

    monkeypatch.setattr("importlib.import_module", import_module_side_effect)
    file_type = FileType(
        importable_package_dependencies=["missingpkg"],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="missing",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(ImportError):
            loader._load_partitioner(file_type)  # 2.33μs -> 2.62μs (11.1% slower)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
def test_load_partitioner_not_partitionable():
    """Test that an assertion is raised if file_type.is_partitionable is False."""
    module_name = "test_partitioner_module.notpartitionable"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=False,
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(AssertionError):
            loader._load_partitioner(file_type)  # 541ns -> 542ns (0.185% slower)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
def test_load_partitioner_function_not_found():
    """Test that AttributeError is raised if the function is not in the module."""
    module_name = "test_partitioner_module.nofunc"
    func_name = "partition_func"
    # Create module without the function
    mod = types.ModuleType(module_name)
    sys.modules[module_name] = mod
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(AttributeError):
            loader._load_partitioner(file_type)  # 8.38μs -> 8.38μs (0.000% faster)
    finally:
        # Do not leak the empty module into tests that run later in the session.
        sys.modules.pop(module_name, None)
def test_load_partitioner_module_not_found():
    """Test that ModuleNotFoundError is raised if the module does not exist."""
    module_name = "test_partitioner_module.doesnotexist"
    func_name = "partition_func"
    # Do not create the module
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="test",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    with pytest.raises(ModuleNotFoundError):
        loader._load_partitioner(file_type)  # 101μs -> 103μs (1.86% slower)
def test_load_partitioner_many_dependencies(monkeypatch):
    """Test loading a partitioner with a large number of dependencies."""
    module_name = "test_partitioner_module.large"
    func_name = "partition_func"
    create_fake_module(module_name, func_name, fake_partitioner)
    dep_names = [f"pkg{i}" for i in range(100)]
    # Set built once so the fake importer's membership test is O(1).
    known_deps = set(dep_names)

    # Simulate import_module returns dummy for all dependencies
    def import_module_side_effect(name):
        if name in known_deps:
            return types.SimpleNamespace()
        return sys.modules[module_name]

    monkeypatch.setattr("importlib.import_module", import_module_side_effect)
    file_type = FileType(
        importable_package_dependencies=dep_names,
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="large",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        part_func = codeflash_output  # 45.9μs -> 56.2μs (18.4% slower)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
    assert part_func is fake_partitioner
def test_load_partitioner_many_calls(monkeypatch):
    """Test repeated calls to _load_partitioner with different modules and dependencies."""

    def make_func(idx):
        # Factory so each lambda captures its own index.
        return lambda *a, **k: [DummyElement(), idx]

    for i in range(50):
        module_name = f"test_partitioner_module.many_{i}"
        func_name = f"partition_func_{i}"
        func = make_func(i)
        create_fake_module(module_name, func_name, func)
        dep_name = f"pkg_{i}"

        # Reads the loop variables lazily; safe because it is only invoked
        # during this iteration's _load_partitioner call.
        def import_module_side_effect(name):
            if name == dep_name:
                return types.SimpleNamespace()
            return sys.modules[module_name]

        monkeypatch.setattr("importlib.import_module", import_module_side_effect)
        file_type = FileType(
            importable_package_dependencies=[dep_name],
            partitioner_function_name=func_name,
            partitioner_module_qname=module_name,
            extra_name=f"many_{i}",
            is_partitionable=True,
        )
        loader = _PartitionerLoader()
        try:
            codeflash_output = loader._load_partitioner(file_type)
            part_func = codeflash_output  # 25.2μs -> 29.3μs (14.2% slower)
        finally:
            # Do not leak 50 fake modules into tests that run later in the session.
            sys.modules.pop(module_name, None)
        assert part_func is func
def test_load_partitioner_large_function_name():
    """Test loading a partitioner with a very long function name."""
    module_name = "test_partitioner_module.longfunc"
    func_name = "partition_func_" + "x" * 200
    create_fake_module(module_name, func_name, fake_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_function_name=func_name,
        partitioner_module_qname=module_name,
        extra_name="longfunc",
        is_partitionable=True,
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        part_func = codeflash_output  # 8.92μs -> 9.17μs (2.73% slower)
    finally:
        # Do not leak the fake module into tests that run later in the session.
        sys.modules.pop(module_name, None)
    assert part_func is fake_partitioner
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
```python
from __future__ import annotations
import importlib
import sys
import types
from typing import Callable
# imports
import pytest
from typing_extensions import TypeAlias
from unstructured.partition.auto import _PartitionerLoader
Partitioner: TypeAlias = Callable[..., list]
class DummyElement:
    """Placeholder element type returned by the dummy partitioners in this module."""
# Minimal FileType stub for testing
class FileType:
    """Minimal file-type stub, used only by these tests.

    It stores exactly the five values passed to ``__init__`` as same-named
    attributes; the tests hand instances to
    ``_PartitionerLoader._load_partitioner``.
    """

    def __init__(
        self,
        importable_package_dependencies,
        partitioner_module_qname,
        partitioner_function_name,
        extra_name,
        is_partitionable=True,
    ):
        self.importable_package_dependencies = importable_package_dependencies
        self.partitioner_module_qname = partitioner_module_qname
        self.partitioner_function_name = partitioner_function_name
        self.extra_name = extra_name
        self.is_partitionable = is_partitionable
# --- Test Suite ---
# Helper: create a dummy partitioner function
def dummy_partitioner(*args, **kwargs):
    """Stand-in partitioner: ignore all arguments and return one ``DummyElement``."""
    return [DummyElement()]
# Helper: create a dummy module with a partitioner function
def make_dummy_module(mod_name, func_name, func):
    """Create a module exposing ``func`` as ``func_name`` and register it in ``sys.modules``.

    Args:
        mod_name: Key under which the module is registered in ``sys.modules``.
        func_name: Attribute name to set on the new module.
        func: Object stored under ``func_name``.

    Returns:
        The newly created module. Any existing ``sys.modules`` entry with the
        same name is overwritten.
    """
    mod = types.ModuleType(mod_name)
    setattr(mod, func_name, func)
    sys.modules[mod_name] = mod
    return mod
# Helper: remove dummy module from sys.modules after test
def remove_dummy_module(mod_name):
    """Remove ``mod_name`` from ``sys.modules``; do nothing if it is not registered."""
    # pop() with a default is a single lookup and never raises KeyError.
    sys.modules.pop(mod_name, None)
# 1. Basic Test Cases
def test_load_partitioner_success_single_dependency():
    """Should load partitioner when no dependencies are declared and function is present."""
    mod_name = "dummy_mod1"
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],  # No dependencies
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        partitioner = codeflash_output  # 7.33μs -> 7.38μs (0.556% slower)
    finally:
        # Clean up even when loading fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
    assert partitioner is dummy_partitioner
def test_load_partitioner_success_multiple_dependencies(monkeypatch):
    """Should load partitioner when all dependencies exist."""
    mod_name = "dummy_mod2"
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, dummy_partitioner)
    file_type = FileType(
        # Standard-library modules, so the real import machinery finds them.
        importable_package_dependencies=["sys", "types"],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        partitioner = codeflash_output  # 15.3μs -> 15.9μs (3.41% slower)
    finally:
        # Clean up even when loading fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
    assert partitioner is dummy_partitioner
def test_load_partitioner_dependency_missing(monkeypatch):
    """Should raise ImportError if a dependency is missing."""
    mod_name = "dummy_mod3"
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=["definitely_not_a_real_package_12345"],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(ImportError):
            loader._load_partitioner(file_type)  # 72.8μs -> 73.4μs (0.851% slower)
    finally:
        # Clean up even when the expectation fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
def test_load_partitioner_function_missing():
    """Should raise AttributeError if the partitioner function is missing."""
    mod_name = "dummy_mod4"
    func_name = "not_present_func"
    # The module exists but only exposes a differently named function.
    make_dummy_module(mod_name, "some_other_func", dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(AttributeError):
            loader._load_partitioner(file_type)  # 8.12μs -> 8.29μs (2.01% slower)
    finally:
        # Clean up even when the expectation fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
def test_load_partitioner_module_missing():
    """Should raise ModuleNotFoundError if the partitioner module does not exist."""
    mod_name = "definitely_not_a_real_module_12345"
    func_name = "partition_func"
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    with pytest.raises(ModuleNotFoundError):
        loader._load_partitioner(file_type)  # 61.2μs -> 61.3μs (0.271% slower)
def test_load_partitioner_not_partitionable():
    """Should raise AssertionError if file_type.is_partitionable is False."""
    mod_name = "dummy_mod5"
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
        is_partitionable=False,
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(AssertionError):
            loader._load_partitioner(file_type)  # 500ns -> 459ns (8.93% faster)
    finally:
        # Clean up even when the expectation fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
# 2. Edge Test Cases
def test_load_partitioner_empty_function_name():
    """Should raise AttributeError if function name is empty."""
    mod_name = "dummy_mod6"
    func_name = ""
    make_dummy_module(mod_name, "some_func", dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        with pytest.raises(AttributeError):
            loader._load_partitioner(file_type)  # 8.08μs -> 8.33μs (2.99% slower)
    finally:
        # Clean up even when the expectation fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
def test_load_partitioner_dependency_name_in_error(monkeypatch):
    """Should only return False if ImportError is for the actual dependency.

    NOTE(review): as written this test never exercises the patched importer.
    The patch is installed and then immediately replaced by the original, and
    nothing is asserted. It needs a call into the dependency check while the
    patch is active to verify the behavior the summary line describes.
    """
    # Patch importlib.import_module to raise ImportError with unrelated message
    orig_import_module = importlib.import_module

    def fake_import_module(name):
        raise ImportError("unrelated error")

    monkeypatch.setattr(importlib, "import_module", fake_import_module)
    monkeypatch.setattr(importlib, "import_module", orig_import_module)
# 3. Large Scale Test Cases
def test_load_partitioner_many_dependencies(monkeypatch):
    """Should handle a large number of dependencies efficiently."""
    # All dependencies are 'sys', which exists; the same name repeated 500
    # times is the case a cached dependency check benefits from most.
    deps = ["sys"] * 500
    mod_name = "dummy_mod8"
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=deps,
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        partitioner = codeflash_output  # 1.73ms -> 23.9μs (7166% faster)
    finally:
        # Clean up even when loading fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
    assert partitioner is dummy_partitioner
def test_load_partitioner_large_module_name(monkeypatch):
    """Should handle a very long module name (within sys.modules limit)."""
    mod_name = "dummy_mod_" + "x" * 200
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, dummy_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        partitioner = codeflash_output  # 7.25μs -> 7.67μs (5.43% slower)
    finally:
        # Clean up even when loading fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
    assert partitioner is dummy_partitioner
def test_load_partitioner_many_calls(monkeypatch):
    """Should remain correct and performant under repeated calls for different modules."""
    n = 50
    # One loader instance is reused across all iterations.
    loader = _PartitionerLoader()
    for i in range(n):
        mod_name = f"dummy_mod_bulk_{i}"
        func_name = "partition_func"
        make_dummy_module(mod_name, func_name, dummy_partitioner)
        file_type = FileType(
            importable_package_dependencies=[],
            partitioner_module_qname=mod_name,
            partitioner_function_name=func_name,
            extra_name="dummy",
        )
        try:
            codeflash_output = loader._load_partitioner(file_type)
            partitioner = codeflash_output  # 194μs -> 195μs (0.832% slower)
        finally:
            # Clean up even when loading fails so later tests see a pristine sys.modules.
            remove_dummy_module(mod_name)
        assert partitioner is dummy_partitioner
def test_load_partitioner_function_returns_large_list():
    """Should not choke if partitioner returns a large list (scalability)."""

    def big_partitioner(*args, **kwargs):
        return [DummyElement() for _ in range(900)]

    mod_name = "dummy_mod9"
    func_name = "partition_func"
    make_dummy_module(mod_name, func_name, big_partitioner)
    file_type = FileType(
        importable_package_dependencies=[],
        partitioner_module_qname=mod_name,
        partitioner_function_name=func_name,
        extra_name="dummy",
    )
    loader = _PartitionerLoader()
    try:
        codeflash_output = loader._load_partitioner(file_type)
        partitioner = codeflash_output  # 7.04μs -> 6.88μs (2.41% faster)
        result = partitioner()
    finally:
        # Clean up even when loading fails so later tests see a pristine sys.modules.
        remove_dummy_module(mod_name)
    # The loaded partitioner should be big_partitioner, which yields 900 elements.
    assert len(result) == 900
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
</details>
To edit these changes, run `git checkout
codeflash/optimize-_PartitionerLoader._load_partitioner-mjebngyb`, commit
your edits, and push.
[Codeflash](https://codeflash.ai)

---------
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
Co-authored-by: Aseem Saxena <aseem.bits@gmail.com>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>