fix(chunkers): address research audit findings

waleedlatif1 · waleedlatif1 · commit 25abb8a343eb · 2026-04-10T17:55:30.000-07:00
- Expand RecursiveChunker recipes: markdown adds horizontal rules, code
  fences, blockquotes; code adds const/let/var/if/for/while/switch/return
- RecursiveChunker fallback uses splitAtWordBoundaries instead of char slicing
- RegexChunker ReDoS test uses adversarial strings (repeated chars, spaces)
- SentenceChunker abbreviation list adds St/Rev/Gen/No/Fig/Vol/months
  and single-capital-letter lookbehind
- Add overlap < maxSize validation in Zod schema and UI form
- Add pattern max length (500) validation in Zod schema
- Fix StructuredDataChunker footer grammar
diff --git a/apps/sim/app/api/knowledge/route.ts b/apps/sim/app/api/knowledge/route.ts
@@ -45,8 +45,8 @@ const CreateKnowledgeBaseSchema = z.object({
       /** Strategy-specific options */
       strategyOptions: z
         .object({
-          /** Regex pattern for 'regex' strategy */
-          pattern: z.string().optional(),
+          /** Regex pattern for 'regex' strategy (max 500 chars) */
+          pattern: z.string().max(500).optional(),
           /** Custom separator hierarchy for 'recursive' strategy */
           separators: z.array(z.string()).optional(),
           /** Pre-built separator recipe for 'recursive' strategy */
@@ -68,6 +68,14 @@ const CreateKnowledgeBaseSchema = z.object({
         message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
       }
     )
+    .refine(
+      (data) => {
+        return data.overlap < data.maxSize
+      },
+      {
+        message: 'Overlap must be less than max chunk size',
+      }
+    )
     .refine(
       (data) => {
         if (data.strategy === 'regex' && !data.strategyOptions?.pattern) {
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
@@ -3,7 +3,7 @@
 import { memo, useEffect, useRef, useState } from 'react'
 import { zodResolver } from '@hookform/resolvers/zod'
 import { createLogger } from '@sim/logger'
-import { Loader2, RotateCcw, X } from 'lucide-react'
+import { ChevronDown, Loader2, RotateCcw, X } from 'lucide-react'
 import { useParams } from 'next/navigation'
 import { useForm } from 'react-hook-form'
 import { z } from 'zod'
@@ -92,6 +92,15 @@ const FormSchema = z
       path: ['minChunkSize'],
     }
   )
+  .refine(
+    (data) => {
+      return data.overlapSize < data.maxChunkSize
+    },
+    {
+      message: 'Overlap must be less than max chunk size',
+      path: ['overlapSize'],
+    }
+  )
   .refine(
     (data) => {
       if (data.strategy === 'regex' && !data.regexPattern?.trim()) {
@@ -469,6 +478,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({
                       >
                         {STRATEGY_OPTIONS.find((o) => o.value === strategyValue)?.label ??
                           'Auto (detect from content)'}
+                        <ChevronDown className='h-[12px] w-[12px] text-[var(--text-icon)]' />
                       </Button>
                     </DropdownMenuTrigger>
                     <DropdownMenuContent align='start' className='w-[var(--radix-dropdown-menu-trigger-width)]'>
diff --git a/apps/sim/lib/chunkers/recursive-chunker.ts b/apps/sim/lib/chunkers/recursive-chunker.ts
@@ -6,6 +6,7 @@ import {
   cleanText,
   estimateTokens,
   resolveChunkerOptions,
+  splitAtWordBoundaries,
   tokensToChars,
 } from '@/lib/chunkers/utils'
 
@@ -14,19 +15,41 @@ const logger = createLogger('RecursiveChunker')
 const RECIPES = {
   plain: ['\n\n', '\n', '. ', ' ', ''],
   markdown: [
+    '\n---\n',
+    '\n***\n',
+    '\n___\n',
     '\n# ',
     '\n## ',
     '\n### ',
     '\n#### ',
     '\n##### ',
     '\n###### ',
+    '\n```\n',
+    '\n> ',
     '\n\n',
     '\n',
     '. ',
     ' ',
     '',
   ],
-  code: ['\nfunction ', '\nclass ', '\nexport ', '\n\n', '\n', '; ', ' ', ''],
+  code: [
+    '\nfunction ',
+    '\nclass ',
+    '\nexport ',
+    '\nconst ',
+    '\nlet ',
+    '\nvar ',
+    '\nif ',
+    '\nfor ',
+    '\nwhile ',
+    '\nswitch ',
+    '\nreturn ',
+    '\n\n',
+    '\n',
+    '; ',
+    ' ',
+    '',
+  ],
 } as const
 
 /**
@@ -61,16 +84,8 @@ export class RecursiveChunker {
     }
 
     if (separatorIndex >= this.separators.length) {
-      const chunks: string[] = []
-      const targetLength = Math.ceil((text.length * this.chunkSize) / tokenCount)
-
-      for (let i = 0; i < text.length; i += targetLength) {
-        const chunk = text.slice(i, i + targetLength).trim()
-        if (chunk) {
-          chunks.push(chunk)
-        }
-      }
-      return chunks
+      const chunkSizeChars = tokensToChars(this.chunkSize)
+      return splitAtWordBoundaries(text, chunkSizeChars)
     }
 
     const separator = this.separators[separatorIndex]
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -43,13 +43,20 @@ export class RegexChunker {
     try {
       const regex = new RegExp(pattern, 'g')
 
-      // Test against a mixed-character string to catch catastrophic backtracking
-      const testStr = 'aB1 xY2\n'.repeat(1250)
-      const start = Date.now()
-      regex.test(testStr)
-      const elapsed = Date.now() - start
-      if (elapsed > 50) {
-        throw new Error('Regex pattern appears to have catastrophic backtracking')
+      // Test against adversarial strings to catch catastrophic backtracking
+      const testStrings = [
+        'a'.repeat(10000),
+        ' '.repeat(10000),
+        'a '.repeat(5000),
+        'aB1 xY2\n'.repeat(1250),
+      ]
+      for (const testStr of testStrings) {
+        const start = Date.now()
+        regex.test(testStr)
+        const elapsed = Date.now() - start
+        if (elapsed > 50) {
+          throw new Error('Regex pattern appears to have catastrophic backtracking')
+        }
       }
 
       regex.lastIndex = 0
diff --git a/apps/sim/lib/chunkers/sentence-chunker.ts b/apps/sim/lib/chunkers/sentence-chunker.ts
@@ -34,7 +34,7 @@ export class SentenceChunker {
    */
   private splitSentences(text: string): string[] {
     return text
-      .split(/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|Inc|Ltd|Corp|approx|dept|est|govt|i\.e|e\.g))(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)
+      .split(/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Rev|Gen|Sgt|No|Fig|Vol|Ch|vs|etc|Inc|Ltd|Corp|approx|dept|est|govt|Jan|Feb|Mar|Apr|Aug|Sep|Oct|Nov|Dec|i\.e|e\.g))(?<![A-Z])(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)
       .filter((s) => s.trim().length > 0)
   }
 
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -113,7 +113,7 @@ export class StructuredDataChunker {
     }
 
     content += rows.join('\n')
-    content += `\n\n[Rows ${rows.length} of data]`
+    content += `\n\n[${rows.length} rows of data]`
 
     return content
   }

Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@ export class SentenceChunker {`
`34`	`34`	`*/`
`35`	`35`	`private splitSentences(text: string): string[] {`
`36`	`36`	`return text`
`37`		`- .split(/(?<!\b(?:Mr\|Mrs\|Ms\|Dr\|Prof\|Sr\|Jr\|vs\|etc\|Inc\|Ltd\|Corp\|approx\|dept\|est\|govt\|i\.e\|e\.g))(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)`
	`37`	`+ .split(/(?<!\b(?:Mr\|Mrs\|Ms\|Dr\|Prof\|Sr\|Jr\|St\|Rev\|Gen\|Sgt\|No\|Fig\|Vol\|Ch\|vs\|etc\|Inc\|Ltd\|Corp\|approx\|dept\|est\|govt\|Jan\|Feb\|Mar\|Apr\|Aug\|Sep\|Oct\|Nov\|Dec\|i\.e\|e\.g))(?<![A-Z])(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)`
`38`	`38`	`.filter((s) => s.trim().length > 0)`
`39`	`39`	`}`
`40`	`40`
Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,7 @@ export class StructuredDataChunker {`
`113`	`113`	`}`
`114`	`114`
`115`	`115`	`content += rows.join('\n')`
`116`		- content += `\n\n[Rows ${rows.length} of data]`
	`116`	+ content += `\n\n[${rows.length} rows of data]`
`117`	`117`
`118`	`118`	`return content`
`119`	`119`	`}`