Skip to content

Commit 25abb8a

Browse files
committed
fix(chunkers): address research audit findings
- Expand RecursiveChunker recipes: markdown adds horizontal rules, code fences, blockquotes; code adds const/let/var/if/for/while/switch/return - RecursiveChunker fallback uses splitAtWordBoundaries instead of char slicing - RegexChunker ReDoS test uses adversarial strings (repeated chars, spaces) - SentenceChunker abbreviation list adds St/Rev/Gen/No/Fig/Vol/months and single-capital-letter lookbehind - Add overlap < maxSize validation in Zod schema and UI form - Add pattern max length (500) validation in Zod schema - Fix StructuredDataChunker footer grammar
1 parent 59f86e9 commit 25abb8a

File tree

6 files changed

+63
-23
lines changed

6 files changed

+63
-23
lines changed

apps/sim/app/api/knowledge/route.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ const CreateKnowledgeBaseSchema = z.object({
4545
/** Strategy-specific options */
4646
strategyOptions: z
4747
.object({
48-
/** Regex pattern for 'regex' strategy */
49-
pattern: z.string().optional(),
48+
/** Regex pattern for 'regex' strategy (max 500 chars) */
49+
pattern: z.string().max(500).optional(),
5050
/** Custom separator hierarchy for 'recursive' strategy */
5151
separators: z.array(z.string()).optional(),
5252
/** Pre-built separator recipe for 'recursive' strategy */
@@ -68,6 +68,14 @@ const CreateKnowledgeBaseSchema = z.object({
6868
message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
6969
}
7070
)
71+
.refine(
72+
(data) => {
73+
return data.overlap < data.maxSize
74+
},
75+
{
76+
message: 'Overlap must be less than max chunk size',
77+
}
78+
)
7179
.refine(
7280
(data) => {
7381
if (data.strategy === 'regex' && !data.strategyOptions?.pattern) {

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import { memo, useEffect, useRef, useState } from 'react'
44
import { zodResolver } from '@hookform/resolvers/zod'
55
import { createLogger } from '@sim/logger'
6-
import { Loader2, RotateCcw, X } from 'lucide-react'
6+
import { ChevronDown, Loader2, RotateCcw, X } from 'lucide-react'
77
import { useParams } from 'next/navigation'
88
import { useForm } from 'react-hook-form'
99
import { z } from 'zod'
@@ -92,6 +92,15 @@ const FormSchema = z
9292
path: ['minChunkSize'],
9393
}
9494
)
95+
.refine(
96+
(data) => {
97+
return data.overlapSize < data.maxChunkSize
98+
},
99+
{
100+
message: 'Overlap must be less than max chunk size',
101+
path: ['overlapSize'],
102+
}
103+
)
95104
.refine(
96105
(data) => {
97106
if (data.strategy === 'regex' && !data.regexPattern?.trim()) {
@@ -469,6 +478,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({
469478
>
470479
{STRATEGY_OPTIONS.find((o) => o.value === strategyValue)?.label ??
471480
'Auto (detect from content)'}
481+
<ChevronDown className='h-[12px] w-[12px] text-[var(--text-icon)]' />
472482
</Button>
473483
</DropdownMenuTrigger>
474484
<DropdownMenuContent align='start' className='w-[var(--radix-dropdown-menu-trigger-width)]'>

apps/sim/lib/chunkers/recursive-chunker.ts

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
cleanText,
77
estimateTokens,
88
resolveChunkerOptions,
9+
splitAtWordBoundaries,
910
tokensToChars,
1011
} from '@/lib/chunkers/utils'
1112

@@ -14,19 +15,41 @@ const logger = createLogger('RecursiveChunker')
1415
const RECIPES = {
1516
plain: ['\n\n', '\n', '. ', ' ', ''],
1617
markdown: [
18+
'\n---\n',
19+
'\n***\n',
20+
'\n___\n',
1721
'\n# ',
1822
'\n## ',
1923
'\n### ',
2024
'\n#### ',
2125
'\n##### ',
2226
'\n###### ',
27+
'\n```\n',
28+
'\n> ',
2329
'\n\n',
2430
'\n',
2531
'. ',
2632
' ',
2733
'',
2834
],
29-
code: ['\nfunction ', '\nclass ', '\nexport ', '\n\n', '\n', '; ', ' ', ''],
35+
code: [
36+
'\nfunction ',
37+
'\nclass ',
38+
'\nexport ',
39+
'\nconst ',
40+
'\nlet ',
41+
'\nvar ',
42+
'\nif ',
43+
'\nfor ',
44+
'\nwhile ',
45+
'\nswitch ',
46+
'\nreturn ',
47+
'\n\n',
48+
'\n',
49+
'; ',
50+
' ',
51+
'',
52+
],
3053
} as const
3154

3255
/**
@@ -61,16 +84,8 @@ export class RecursiveChunker {
6184
}
6285

6386
if (separatorIndex >= this.separators.length) {
64-
const chunks: string[] = []
65-
const targetLength = Math.ceil((text.length * this.chunkSize) / tokenCount)
66-
67-
for (let i = 0; i < text.length; i += targetLength) {
68-
const chunk = text.slice(i, i + targetLength).trim()
69-
if (chunk) {
70-
chunks.push(chunk)
71-
}
72-
}
73-
return chunks
87+
const chunkSizeChars = tokensToChars(this.chunkSize)
88+
return splitAtWordBoundaries(text, chunkSizeChars)
7489
}
7590

7691
const separator = this.separators[separatorIndex]

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,20 @@ export class RegexChunker {
4343
try {
4444
const regex = new RegExp(pattern, 'g')
4545

46-
// Test against a mixed-character string to catch catastrophic backtracking
47-
const testStr = 'aB1 xY2\n'.repeat(1250)
48-
const start = Date.now()
49-
regex.test(testStr)
50-
const elapsed = Date.now() - start
51-
if (elapsed > 50) {
52-
throw new Error('Regex pattern appears to have catastrophic backtracking')
46+
// Test against adversarial strings to catch catastrophic backtracking
47+
const testStrings = [
48+
'a'.repeat(10000),
49+
' '.repeat(10000),
50+
'a '.repeat(5000),
51+
'aB1 xY2\n'.repeat(1250),
52+
]
53+
for (const testStr of testStrings) {
54+
const start = Date.now()
55+
regex.test(testStr)
56+
const elapsed = Date.now() - start
57+
if (elapsed > 50) {
58+
throw new Error('Regex pattern appears to have catastrophic backtracking')
59+
}
5360
}
5461

5562
regex.lastIndex = 0

apps/sim/lib/chunkers/sentence-chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ export class SentenceChunker {
3434
*/
3535
private splitSentences(text: string): string[] {
3636
return text
37-
.split(/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|Inc|Ltd|Corp|approx|dept|est|govt|i\.e|e\.g))(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)
37+
.split(/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Rev|Gen|Sgt|No|Fig|Vol|Ch|vs|etc|Inc|Ltd|Corp|approx|dept|est|govt|Jan|Feb|Mar|Apr|Aug|Sep|Oct|Nov|Dec|i\.e|e\.g))(?<![A-Z])(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)
3838
.filter((s) => s.trim().length > 0)
3939
}
4040

apps/sim/lib/chunkers/structured-data-chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ export class StructuredDataChunker {
113113
}
114114

115115
content += rows.join('\n')
116-
content += `\n\n[Rows ${rows.length} of data]`
116+
content += `\n\n[${rows.length} rows of data]`
117117

118118
return content
119119
}

0 commit comments

Comments
 (0)