Skip to content

Commit 59f86e9

Browse files
committed
fix(chunkers): standardize token estimation and use emcn dropdown
- Refactor all existing chunkers (Text, JsonYaml, StructuredData, Docs) to use shared utils - Fix inconsistent token estimation (JsonYaml used tiktoken, StructuredData used /3 ratio) - Fix DocsChunker operator precedence bug and hard-coded 300-token limit - Fix JsonYamlChunker isStructuredData false positive on plain strings - Add MAX_DEPTH recursion guard to JsonYamlChunker - Replace @/components/ui/select with emcn DropdownMenu in strategy selector
1 parent 9f83f87 commit 59f86e9

File tree

5 files changed

+192
-430
lines changed

5 files changed

+192
-430
lines changed

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ import { useForm } from 'react-hook-form'
99
import { z } from 'zod'
1010
import {
1111
Button,
12+
DropdownMenu,
13+
DropdownMenuContent,
14+
DropdownMenuRadioGroup,
15+
DropdownMenuRadioItem,
16+
DropdownMenuTrigger,
1217
Input,
1318
Label,
1419
Modal,
@@ -18,13 +23,6 @@ import {
1823
ModalHeader,
1924
Textarea,
2025
} from '@/components/emcn'
21-
import {
22-
Select,
23-
SelectContent,
24-
SelectItem,
25-
SelectTrigger,
26-
SelectValue,
27-
} from '@/components/ui/select'
2826
import type { StrategyOptions } from '@/lib/chunkers/types'
2927
import { cn } from '@/lib/core/utils/cn'
3028
import { formatFileSize, validateKnowledgeBaseFile } from '@/lib/uploads/utils/file-utils'
@@ -461,24 +459,33 @@ export const CreateBaseModal = memo(function CreateBaseModal({
461459
</div>
462460

463461
<div className='flex flex-col gap-2'>
464-
<Label htmlFor='strategy'>Chunking Strategy</Label>
465-
<Select
466-
value={strategyValue}
467-
onValueChange={(value) =>
468-
setValue('strategy', value as FormValues['strategy'])
469-
}
470-
>
471-
<SelectTrigger id='strategy'>
472-
<SelectValue placeholder='Auto (detect from content)' />
473-
</SelectTrigger>
474-
<SelectContent>
475-
{STRATEGY_OPTIONS.map((option) => (
476-
<SelectItem key={option.value} value={option.value}>
477-
{option.label}
478-
</SelectItem>
479-
))}
480-
</SelectContent>
481-
</Select>
462+
<Label>Chunking Strategy</Label>
463+
<DropdownMenu>
464+
<DropdownMenuTrigger asChild>
465+
<Button
466+
type='button'
467+
variant='default'
468+
className='w-full justify-between border border-[var(--border-1)] !bg-[var(--surface-1)] font-normal'
469+
>
470+
{STRATEGY_OPTIONS.find((o) => o.value === strategyValue)?.label ??
471+
'Auto (detect from content)'}
472+
</Button>
473+
</DropdownMenuTrigger>
474+
<DropdownMenuContent align='start' className='w-[var(--radix-dropdown-menu-trigger-width)]'>
475+
<DropdownMenuRadioGroup
476+
value={strategyValue}
477+
onValueChange={(value) =>
478+
setValue('strategy', value as FormValues['strategy'])
479+
}
480+
>
481+
{STRATEGY_OPTIONS.map((option) => (
482+
<DropdownMenuRadioItem key={option.value} value={option.value}>
483+
{option.label}
484+
</DropdownMenuRadioItem>
485+
))}
486+
</DropdownMenuRadioGroup>
487+
</DropdownMenuContent>
488+
</DropdownMenu>
482489
<p className='text-[var(--text-muted)] text-xs'>
483490
Auto detects the best strategy based on file content type.
484491
</p>

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 26 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@ import path from 'path'
33
import { createLogger } from '@sim/logger'
44
import { TextChunker } from '@/lib/chunkers/text-chunker'
55
import type { DocChunk, DocsChunkerOptions } from '@/lib/chunkers/types'
6+
import { estimateTokens } from '@/lib/chunkers/utils'
67
import { generateEmbeddings } from '@/lib/knowledge/embeddings'
78

89
interface HeaderInfo {
910
level: number
1011
text: string
11-
slug?: string
1212
anchor?: string
1313
position?: number
1414
}
@@ -27,10 +27,12 @@ const logger = createLogger('DocsChunker')
2727
export class DocsChunker {
2828
private readonly textChunker: TextChunker
2929
private readonly baseUrl: string
30+
private readonly chunkSize: number
3031

3132
constructor(options: DocsChunkerOptions = {}) {
33+
this.chunkSize = options.chunkSize ?? 300
3234
this.textChunker = new TextChunker({
33-
chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
35+
chunkSize: this.chunkSize,
3436
minCharactersPerChunk: options.minCharactersPerChunk ?? 1,
3537
chunkOverlap: options.chunkOverlap ?? 50,
3638
})
@@ -97,7 +99,7 @@ export class DocsChunker {
9799

98100
const chunk: DocChunk = {
99101
text: chunkText,
100-
tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
102+
tokenCount: estimateTokens(chunkText),
101103
sourceDocument: relativePath,
102104
headerLink: relevantHeader ? `${documentUrl}#${relevantHeader.anchor}` : documentUrl,
103105
headerText: relevantHeader?.text || frontmatter.title || 'Document Root',
@@ -170,28 +172,23 @@ export class DocsChunker {
170172
private generateAnchor(headerText: string): string {
171173
return headerText
172174
.toLowerCase()
173-
.replace(/[^\w\s-]/g, '') // Remove special characters except hyphens
174-
.replace(/\s+/g, '-') // Replace spaces with hyphens
175-
.replace(/-+/g, '-') // Replace multiple hyphens with single
176-
.replace(/^-|-$/g, '') // Remove leading/trailing hyphens
175+
.replace(/[^\w\s-]/g, '')
176+
.replace(/\s+/g, '-')
177+
.replace(/-+/g, '-')
178+
.replace(/^-|-$/g, '')
177179
}
178180

179181
/**
180182
* Generate document URL from relative path
181183
* Handles index.mdx files specially - they are served at the parent directory path
182184
*/
183185
private generateDocumentUrl(relativePath: string): string {
184-
// Convert file path to URL path
185-
// e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
186-
// e.g., "triggers/index.mdx" -> "/triggers" (NOT "/triggers/index")
187-
let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') // Handle Windows paths
186+
let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/')
188187

189-
// In fumadocs, index.mdx files are served at the parent directory path
190-
// e.g., "triggers/index" -> "triggers"
191188
if (urlPath.endsWith('/index')) {
192-
urlPath = urlPath.slice(0, -6) // Remove "/index"
189+
urlPath = urlPath.slice(0, -6)
193190
} else if (urlPath === 'index') {
194-
urlPath = '' // Root index.mdx
191+
urlPath = ''
195192
}
196193

197194
return `${this.baseUrl}/${urlPath}`
@@ -243,12 +240,11 @@ export class DocsChunker {
243240
private cleanContent(content: string): string {
244241
return (
245242
content
246-
// Remove import statements
243+
.replace(/\r\n/g, '\n')
244+
.replace(/\r/g, '\n')
247245
.replace(/^import\s+.*$/gm, '')
248-
// Remove JSX components and React-style comments
249246
.replace(/<[^>]+>/g, ' ')
250247
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
251-
// Remove excessive whitespace
252248
.replace(/\n{3,}/g, '\n\n')
253249
.replace(/[ \t]{2,}/g, ' ')
254250
.trim()
@@ -285,13 +281,6 @@ export class DocsChunker {
285281
return { data, content: markdownContent }
286282
}
287283

288-
/**
289-
* Estimate token count (rough approximation)
290-
*/
291-
private estimateTokens(text: string): number {
292-
return Math.ceil(text.length / 4)
293-
}
294-
295284
/**
296285
* Detect table boundaries in markdown content to avoid splitting them
297286
*/
@@ -314,7 +303,7 @@ export class DocsChunker {
314303
} else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
315304
tables.push({
316305
start: this.getCharacterPosition(lines, tableStart),
317-
end: this.getCharacterPosition(lines, i - 1) + lines[i - 1]?.length || 0,
306+
end: this.getCharacterPosition(lines, i - 1) + (lines[i - 1]?.length ?? 0),
318307
})
319308
inTable = false
320309
}
@@ -354,6 +343,10 @@ export class DocsChunker {
354343

355344
for (const chunk of chunks) {
356345
const chunkStart = originalContent.indexOf(chunk, currentPosition)
346+
if (chunkStart === -1) {
347+
mergedChunks.push(chunk)
348+
continue
349+
}
357350
const chunkEnd = chunkStart + chunk.length
358351

359352
const intersectsTable = tableBoundaries.some(
@@ -373,10 +366,10 @@ export class DocsChunker {
373366

374367
const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start))
375368
const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
376-
const completeChunk = originalContent.slice(minStart, maxEnd)
369+
const completeChunk = originalContent.slice(minStart, maxEnd).trim()
377370

378-
if (!mergedChunks.some((existing) => existing.includes(completeChunk.trim()))) {
379-
mergedChunks.push(completeChunk.trim())
371+
if (completeChunk && !mergedChunks.some((existing) => existing.includes(completeChunk))) {
372+
mergedChunks.push(completeChunk)
380373
}
381374
} else {
382375
mergedChunks.push(chunk)
@@ -389,15 +382,15 @@ export class DocsChunker {
389382
}
390383

391384
/**
392-
* Enforce 300 token size limit on chunks
385+
* Enforce token size limit on chunks, using the configured chunkSize
393386
*/
394387
private enforceSizeLimit(chunks: string[]): string[] {
395388
const finalChunks: string[] = []
396389

397390
for (const chunk of chunks) {
398-
const tokens = this.estimateTokens(chunk)
391+
const tokens = estimateTokens(chunk)
399392

400-
if (tokens <= 300) {
393+
if (tokens <= this.chunkSize) {
401394
finalChunks.push(chunk)
402395
} else {
403396
const lines = chunk.split('\n')
@@ -406,7 +399,7 @@ export class DocsChunker {
406399
for (const line of lines) {
407400
const testChunk = currentChunk ? `${currentChunk}\n${line}` : line
408401

409-
if (this.estimateTokens(testChunk) <= 300) {
402+
if (estimateTokens(testChunk) <= this.chunkSize) {
410403
currentChunk = testChunk
411404
} else {
412405
if (currentChunk.trim()) {

0 commit comments

Comments
 (0)