Skip to content

Commit d6a4e39

Browse files
authored
Merge pull request #4829 from kemo/perf/formula-ast-parsing
Add formula token cache to reduce repeated parsing overhead
2 parents 3158be0 + e02c791 commit d6a4e39

4 files changed

Lines changed: 475 additions & 2 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org). This is a
99

1010
### Added
1111

12-
- Nothing yet.
12+
- Optional method to increase Calculation Engine's parsing speed. [PR #4829](https://github.com/PHPOffice/PhpSpreadsheet/pull/4829)
1313

1414
### Removed
1515

src/PhpSpreadsheet/Calculation/Calculation.php

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,19 @@ class Calculation extends CalculationLocale
9090
*/
9191
private bool $calculationCacheEnabled = true;
9292

93+
/**
94+
* Maximum number of entries in the formula token cache.
95+
* Default 0 (disabled). Set via setFormulaTokenCacheMaxSize() to enable.
96+
*/
97+
private int $formulaTokenCacheMaxSize = 0;
98+
99+
/**
100+
* Cache of parsed formula tokens, keyed by the raw formula string.
101+
*
102+
* @var array<string, array<mixed>|bool>
103+
*/
104+
private array $formulaTokenCache = [];
105+
93106
private BranchPruner $branchPruner;
94107

95108
protected bool $branchPruningEnabled = true;
@@ -241,6 +254,7 @@ public function flushInstance(): void
241254
{
242255
$this->clearCalculationCache();
243256
$this->branchPruner->clearBranchStore();
257+
$this->formulaTokenCache = [];
244258
}
245259

246260
/**
@@ -366,6 +380,44 @@ public function clearCalculationCache(): void
366380
$this->calculationCache = [];
367381
}
368382

383+
/**
 * Discard every cached parsed-formula token list.
 *
 * Only the cached entries are dropped; the configured maximum size is left
 * untouched, so caching continues on subsequent parses if it was enabled.
 */
public function clearFormulaTokenCache(): void
{
    $this->formulaTokenCache = [];
}
390+
391+
/**
 * Report how many formulas currently have a cached token list.
 *
 * @return int Number of entries held in the formula token cache
 */
public function getFormulaTokenCacheSize(): int
{
    $entryCount = count($this->formulaTokenCache);

    return $entryCount;
}
398+
399+
/**
 * Set the maximum number of entries in the formula token cache.
 *
 * Set to 0 to disable caching (default), or a positive integer to enable.
 * Negative values are treated as 0. If the cache currently holds more
 * entries than the new limit allows (which includes any non-empty cache when
 * the limit becomes 0), it is emptied so that it never exceeds the
 * configured maximum.
 *
 * @param int $size Maximum number of cached formulas; values <= 0 disable the cache
 *
 * @return $this
 */
public function setFormulaTokenCacheMaxSize(int $size): self
{
    $this->formulaTokenCacheMaxSize = max(0, $size);

    // Without this check, lowering the limit would leave the cache above its
    // bound until the next cache miss triggered the eviction in parseFormula().
    if (count($this->formulaTokenCache) > $this->formulaTokenCacheMaxSize) {
        $this->formulaTokenCache = [];
    }

    return $this;
}
412+
413+
/**
 * Report the configured upper bound on formula token cache entries.
 *
 * @return int The configured limit; 0 means the cache is disabled
 */
public function getFormulaTokenCacheMaxSize(): int
{
    return $this->formulaTokenCacheMaxSize;
}
420+
369421
/**
370422
* Clear calculation cache for a specified worksheet.
371423
*/
@@ -559,6 +611,12 @@ public function calculateCellValue(?Cell $cell = null, bool $resetLog = true): m
559611
*/
560612
public function parseFormula(string $formula): array|bool
561613
{
614+
// Check the formula token cache first (only when caching is enabled)
615+
if ($this->formulaTokenCacheMaxSize > 0 && isset($this->formulaTokenCache[$formula])) {
616+
return $this->formulaTokenCache[$formula];
617+
}
618+
619+
$originalFormula = $formula;
562620
$formula = Preg::replaceCallback(
563621
self::CALCULATION_REGEXP_CELLREF_SPILL,
564622
fn (array $matches) => 'ANCHORARRAY(' . substr($matches[0], 0, -1) . ')',
@@ -576,7 +634,19 @@ public function parseFormula(string $formula): array|bool
576634
}
577635

578636
// Parse the formula and return the token stack
579-
return $this->internalParseFormula($formula);
637+
$result = $this->internalParseFormula($formula);
638+
639+
// Cache the result when caching is enabled (clear cache if it exceeds the maximum size)
640+
if ($this->formulaTokenCacheMaxSize > 0) {
641+
if (count($this->formulaTokenCache) >= $this->formulaTokenCacheMaxSize) {
642+
$this->formulaTokenCache = [];
643+
}
644+
// Cache key is the original formula string (before ANCHORARRAY transformation)
645+
// to ensure consistent lookup regardless of internal transformations.
646+
$this->formulaTokenCache[$originalFormula] = $result;
647+
}
648+
649+
return $result;
580650
}
581651

582652
/**
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpOffice\PhpSpreadsheetBenchmarks;
6+
7+
use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
8+
use PhpOffice\PhpSpreadsheet\Spreadsheet;
9+
use PHPUnit\Framework\TestCase;
10+
11+
/**
 * Benchmark tests for the formula token cache in the Calculation engine.
 *
 * These tests demonstrate the performance benefit of caching parsed formula
 * tokens so that identical formula strings are not re-parsed on every evaluation.
 * Timings are reported on STDERR. The assertions compare wall-clock durations,
 * so they can be noisy on a heavily loaded machine.
 *
 * Run with: vendor/bin/phpunit --testsuite Benchmark --filter FormulaTokenCacheBenchmark --stderr
 */
#[\PHPUnit\Framework\Attributes\Group('benchmark')]
class FormulaTokenCacheBenchmarkTest extends TestCase
{
    private const CACHE_SIZE = 1000;

    /** Formula patterns used across benchmarks. */
    private const FORMULA_PATTERNS = [
        '=A%d+B%d',
        '=SUM(A%d:B%d)',
        '=IF(A%d>0,B%d,C%d)',
        '=AVERAGE(A%d:D%d)',
        '=A%d*B%d+C%d',
        '=MAX(A%d,B%d,C%d)',
        '=MIN(A%d:C%d)/D%d',
        '=CONCATENATE(A%d,"-",B%d)',
        '=ROUND(A%d/B%d,2)',
        '=IFERROR(A%d/B%d,0)',
    ];

    /**
     * Benchmark: parse 1000 spreadsheet-like formulas with cache enabled vs disabled.
     *
     * The 1000 formulas cycle through 100 distinct strings, so even the "cold"
     * cached run parses each distinct formula only once and serves the
     * remaining calls from the cache.
     */
    public function testParseFormulaCacheEnabledVsDisabled(): void
    {
        $spreadsheet = new Spreadsheet();

        try {
            $calculation = Calculation::getInstance($spreadsheet);
            $cellCount = 1000;

            // Build a realistic set of formulas that a spreadsheet might contain
            $patternCount = count(self::FORMULA_PATTERNS);
            $formulas = [];
            for ($row = 1; $row <= $cellCount; ++$row) {
                $pattern = self::FORMULA_PATTERNS[$row % $patternCount];
                $refRow = (($row - 1) % 100) + 1;
                $formulas[] = $this->buildFormula($pattern, $refRow);
            }

            // --- Run 1: Cache disabled (default) ---
            $calculation->setFormulaTokenCacheMaxSize(0);
            $noCacheNs = $this->timeParsing($calculation, $formulas);

            // --- Run 2: Cache enabled, cold ---
            $calculation->setFormulaTokenCacheMaxSize(self::CACHE_SIZE);
            $coldNs = $this->timeParsing($calculation, $formulas);
            $cacheSize = $calculation->getFormulaTokenCacheSize();

            // --- Run 3: Cache enabled, warm ---
            $warmNs = $this->timeParsing($calculation, $formulas);

            $noCacheMs = $noCacheNs / 1_000_000;
            $coldMs = $coldNs / 1_000_000;
            $warmMs = $warmNs / 1_000_000;

            fwrite(STDERR, "\n");
            fwrite(STDERR, "=== parseFormula() Enabled vs Disabled ({$cellCount} formulas) ===\n");
            fwrite(STDERR, sprintf(" PHP version: %s (%s)\n", PHP_VERSION, PHP_OS));
            fwrite(STDERR, sprintf(" Cache disabled: %8.2f ms\n", $noCacheMs));
            fwrite(STDERR, sprintf(" Cache enabled (cold): %8.2f ms\n", $coldMs));
            fwrite(STDERR, sprintf(" Cache enabled (warm): %8.2f ms\n", $warmMs));
            fwrite(STDERR, sprintf(" Cache entries: %d\n", $cacheSize));
            fwrite(STDERR, "\n");

            self::assertGreaterThan(0, $cacheSize);
            self::assertLessThan($noCacheMs, $warmMs, 'Warm cache should be faster than no cache');
        } finally {
            // Release the workbook even when an assertion above fails.
            $spreadsheet->disconnectWorksheets();
        }
    }

    /**
     * Benchmark: directly parse 10,000 formulas drawn from a pool of 200
     * distinct formulas, comparing a cold cache, a warm cache, and a cache
     * that has just been cleared.
     *
     * The formula set is designed so the total distinct formulas stay well
     * under the cache limit, ensuring cache hits are reliable.
     */
    public function testParseFormulaCacheVsNocache(): void
    {
        $spreadsheet = new Spreadsheet();

        try {
            $calculation = Calculation::getInstance($spreadsheet);
            $calculation->setFormulaTokenCacheMaxSize(self::CACHE_SIZE);
            $totalFormulas = 10_000;

            // Build a pool of 200 distinct formulas (well under the cache limit).
            $distinctPool = [];
            $patternCount = count(self::FORMULA_PATTERNS);
            for ($i = 1; $i <= 200; ++$i) {
                $pattern = self::FORMULA_PATTERNS[$i % $patternCount];
                $distinctPool[] = $this->buildFormula($pattern, $i);
            }
            $distinctCount = count($distinctPool);

            // Cycle through the pool so each distinct formula is requested
            // 50 times across the 10,000 calls.
            $formulas = [];
            for ($i = 0; $i < $totalFormulas; ++$i) {
                $formulas[] = $distinctPool[$i % $distinctCount];
            }

            // --- Run 1: Cold cache (each distinct formula is parsed on first
            // sight; its later repeats are already served from the cache) ---
            $calculation->clearFormulaTokenCache();
            $coldNs = $this->timeParsing($calculation, $formulas);

            // --- Run 2: Warm cache (every call served from cache) ---
            $cacheSize = $calculation->getFormulaTokenCacheSize();
            $warmNs = $this->timeParsing($calculation, $formulas);

            // --- Run 3: Cleared cache (same conditions as the cold run:
            // one real parse per distinct formula, then cache hits) ---
            $calculation->clearFormulaTokenCache();
            $clearedNs = $this->timeParsing($calculation, $formulas);

            $coldMs = $coldNs / 1_000_000;
            $warmMs = $warmNs / 1_000_000;
            $clearedMs = $clearedNs / 1_000_000;
            $warmVsColdPct = $coldMs > 0 ? (($coldMs - $warmMs) / $coldMs) * 100 : 0;
            $warmVsClearedPct = $clearedMs > 0 ? (($clearedMs - $warmMs) / $clearedMs) * 100 : 0;

            fwrite(STDERR, "\n");
            fwrite(STDERR, "=== parseFormula() Benchmark ({$totalFormulas} calls, {$distinctCount} distinct) ===\n");
            fwrite(STDERR, sprintf(" PHP version: %s (%s)\n", PHP_VERSION, PHP_OS));
            fwrite(STDERR, sprintf(" Cold cache (first parse): %8.2f ms\n", $coldMs));
            fwrite(STDERR, sprintf(" Warm cache (all cached): %8.2f ms\n", $warmMs));
            fwrite(STDERR, sprintf(" Cleared cache (re-parse): %8.2f ms\n", $clearedMs));
            fwrite(STDERR, sprintf(" Warm vs cold improvement: %8.2f %%\n", $warmVsColdPct));
            fwrite(STDERR, sprintf(" Warm vs cleared improvement:%7.2f %%\n", $warmVsClearedPct));
            fwrite(STDERR, sprintf(" Cache entries after cold: %d\n", $cacheSize));
            fwrite(STDERR, sprintf(" Cache entries after clear: %d\n", $calculation->getFormulaTokenCacheSize()));
            fwrite(STDERR, "\n");

            // Warm cache should be faster than cold cache for repeated formulas
            self::assertLessThan($coldMs, $warmMs, 'Warm cache should be faster than cold cache');
            self::assertLessThan($clearedMs, $warmMs, 'Warm cache should be faster than cleared cache');
        } finally {
            // Release the workbook even when an assertion above fails.
            $spreadsheet->disconnectWorksheets();
        }
    }

    /**
     * Parse every formula in order and return the elapsed time in nanoseconds.
     *
     * @param list<string> $formulas Formulas to pass to parseFormula(), in call order
     *
     * @return float|int Elapsed nanoseconds as measured by hrtime(true)
     */
    private function timeParsing(Calculation $calculation, array $formulas): int|float
    {
        $start = hrtime(true);
        foreach ($formulas as $formula) {
            $calculation->parseFormula($formula);
        }

        return hrtime(true) - $start;
    }

    /**
     * Build a concrete formula from a pattern and row number.
     *
     * Patterns use %d placeholders; all are replaced with the row number.
     */
    private function buildFormula(string $pattern, int $row): string
    {
        return sprintf(
            $pattern,
            ...array_fill(0, substr_count($pattern, '%d'), $row)
        );
    }
}

0 commit comments

Comments
 (0)