@@ -3,12 +3,12 @@ import path from 'path'
33import { createLogger } from '@sim/logger'
44import { TextChunker } from '@/lib/chunkers/text-chunker'
55import type { DocChunk , DocsChunkerOptions } from '@/lib/chunkers/types'
6+ import { estimateTokens } from '@/lib/chunkers/utils'
67import { generateEmbeddings } from '@/lib/knowledge/embeddings'
78
89interface HeaderInfo {
910 level : number
1011 text : string
11- slug ?: string
1212 anchor ?: string
1313 position ?: number
1414}
@@ -27,10 +27,12 @@ const logger = createLogger('DocsChunker')
2727export class DocsChunker {
2828 private readonly textChunker : TextChunker
2929 private readonly baseUrl : string
30+ private readonly chunkSize : number
3031
3132 constructor ( options : DocsChunkerOptions = { } ) {
33+ this . chunkSize = options . chunkSize ?? 300
3234 this . textChunker = new TextChunker ( {
33- chunkSize : options . chunkSize ?? 300 , // Max 300 tokens per chunk
35+ chunkSize : this . chunkSize ,
3436 minCharactersPerChunk : options . minCharactersPerChunk ?? 1 ,
3537 chunkOverlap : options . chunkOverlap ?? 50 ,
3638 } )
@@ -97,7 +99,7 @@ export class DocsChunker {
9799
98100 const chunk : DocChunk = {
99101 text : chunkText ,
100- tokenCount : Math . ceil ( chunkText . length / 4 ) , // Simple token estimation
102+ tokenCount : estimateTokens ( chunkText ) ,
101103 sourceDocument : relativePath ,
102104 headerLink : relevantHeader ? `${ documentUrl } #${ relevantHeader . anchor } ` : documentUrl ,
103105 headerText : relevantHeader ?. text || frontmatter . title || 'Document Root' ,
@@ -170,28 +172,23 @@ export class DocsChunker {
170172 private generateAnchor ( headerText : string ) : string {
171173 return headerText
172174 . toLowerCase ( )
173- . replace ( / [ ^ \w \s - ] / g, '' ) // Remove special characters except hyphens
174- . replace ( / \s + / g, '-' ) // Replace spaces with hyphens
175- . replace ( / - + / g, '-' ) // Replace multiple hyphens with single
176- . replace ( / ^ - | - $ / g, '' ) // Remove leading/trailing hyphens
175+ . replace ( / [ ^ \w \s - ] / g, '' )
176+ . replace ( / \s + / g, '-' )
177+ . replace ( / - + / g, '-' )
178+ . replace ( / ^ - | - $ / g, '' )
177179 }
178180
179181 /**
180182 * Generate document URL from relative path
181183 * Handles index.mdx files specially - they are served at the parent directory path
182184 */
183185 private generateDocumentUrl ( relativePath : string ) : string {
184- // Convert file path to URL path
185- // e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
186- // e.g., "triggers/index.mdx" -> "/triggers" (NOT "/triggers/index")
187- let urlPath = relativePath . replace ( / \. m d x $ / , '' ) . replace ( / \\ / g, '/' ) // Handle Windows paths
186+ let urlPath = relativePath . replace ( / \. m d x $ / , '' ) . replace ( / \\ / g, '/' )
188187
189- // In fumadocs, index.mdx files are served at the parent directory path
190- // e.g., "triggers/index" -> "triggers"
191188 if ( urlPath . endsWith ( '/index' ) ) {
192- urlPath = urlPath . slice ( 0 , - 6 ) // Remove "/index"
189+ urlPath = urlPath . slice ( 0 , - 6 )
193190 } else if ( urlPath === 'index' ) {
194- urlPath = '' // Root index.mdx
191+ urlPath = ''
195192 }
196193
197194 return `${ this . baseUrl } /${ urlPath } `
@@ -243,12 +240,11 @@ export class DocsChunker {
243240 private cleanContent ( content : string ) : string {
244241 return (
245242 content
246- // Remove import statements
243+ . replace ( / \r \n / g, '\n' )
244+ . replace ( / \r / g, '\n' )
247245 . replace ( / ^ i m p o r t \s + .* $ / gm, '' )
248- // Remove JSX components and React-style comments
249246 . replace ( / < [ ^ > ] + > / g, ' ' )
250247 . replace ( / \{ \/ \* [ \s \S ] * ?\* \/ \} / g, ' ' )
251- // Remove excessive whitespace
252248 . replace ( / \n { 3 , } / g, '\n\n' )
253249 . replace ( / [ \t ] { 2 , } / g, ' ' )
254250 . trim ( )
@@ -285,13 +281,6 @@ export class DocsChunker {
285281 return { data, content : markdownContent }
286282 }
287283
288- /**
289- * Estimate token count (rough approximation)
290- */
291- private estimateTokens ( text : string ) : number {
292- return Math . ceil ( text . length / 4 )
293- }
294-
295284 /**
296285 * Detect table boundaries in markdown content to avoid splitting them
297286 */
@@ -314,7 +303,7 @@ export class DocsChunker {
314303 } else if ( inTable && ( ! line . includes ( '|' ) || line === '' || line . startsWith ( '#' ) ) ) {
315304 tables . push ( {
316305 start : this . getCharacterPosition ( lines , tableStart ) ,
317- end : this . getCharacterPosition ( lines , i - 1 ) + lines [ i - 1 ] ?. length || 0 ,
306+ end : this . getCharacterPosition ( lines , i - 1 ) + ( lines [ i - 1 ] ?. length ?? 0 ) ,
318307 } )
319308 inTable = false
320309 }
@@ -354,6 +343,10 @@ export class DocsChunker {
354343
355344 for ( const chunk of chunks ) {
356345 const chunkStart = originalContent . indexOf ( chunk , currentPosition )
346+ if ( chunkStart === - 1 ) {
347+ mergedChunks . push ( chunk )
348+ continue
349+ }
357350 const chunkEnd = chunkStart + chunk . length
358351
359352 const intersectsTable = tableBoundaries . some (
@@ -373,10 +366,10 @@ export class DocsChunker {
373366
374367 const minStart = Math . min ( chunkStart , ...affectedTables . map ( ( t ) => t . start ) )
375368 const maxEnd = Math . max ( chunkEnd , ...affectedTables . map ( ( t ) => t . end ) )
376- const completeChunk = originalContent . slice ( minStart , maxEnd )
369+ const completeChunk = originalContent . slice ( minStart , maxEnd ) . trim ( )
377370
378- if ( ! mergedChunks . some ( ( existing ) => existing . includes ( completeChunk . trim ( ) ) ) ) {
379- mergedChunks . push ( completeChunk . trim ( ) )
371+ if ( completeChunk && ! mergedChunks . some ( ( existing ) => existing . includes ( completeChunk ) ) ) {
372+ mergedChunks . push ( completeChunk )
380373 }
381374 } else {
382375 mergedChunks . push ( chunk )
@@ -389,15 +382,15 @@ export class DocsChunker {
389382 }
390383
391384 /**
392- * Enforce 300 token size limit on chunks
385+ * Enforce token size limit on chunks, using the configured chunkSize
393386 */
394387 private enforceSizeLimit ( chunks : string [ ] ) : string [ ] {
395388 const finalChunks : string [ ] = [ ]
396389
397390 for ( const chunk of chunks ) {
398- const tokens = this . estimateTokens ( chunk )
391+ const tokens = estimateTokens ( chunk )
399392
400- if ( tokens <= 300 ) {
393+ if ( tokens <= this . chunkSize ) {
401394 finalChunks . push ( chunk )
402395 } else {
403396 const lines = chunk . split ( '\n' )
@@ -406,7 +399,7 @@ export class DocsChunker {
406399 for ( const line of lines ) {
407400 const testChunk = currentChunk ? `${ currentChunk } \n${ line } ` : line
408401
409- if ( this . estimateTokens ( testChunk ) <= 300 ) {
402+ if ( estimateTokens ( testChunk ) <= this . chunkSize ) {
410403 currentChunk = testChunk
411404 } else {
412405 if ( currentChunk . trim ( ) ) {
0 commit comments