Skip to content

Commit 401d801

Browse files
committed
fix(kb): fix race condition in stuck document retry during sync
The stuck document retry at the end of each sync was querying for all documents with processingStatus 'pending' or 'failed'. This included documents added in the CURRENT sync that were still processing asynchronously, causing duplicate concurrent processing attempts. The race between the original (correct) processing and the retry (which reads the raw title from the DB as the filename) produced nondeterministic failures — some documents would succeed while others would fail with "Unsupported file type: <meeting title>".

Fixes:
- Filter the stuck doc query by uploadedAt < syncStartedAt to exclude documents from the current sync
- Pass mimeType through to parseHttpFile and use the existing getExtensionFromMimeType utility as a fallback when the filename has no extension (e.g. Fireflies meeting titles)
- Apply the same mimeType fallback in parseDataURI for consistency
1 parent d5a3ce2 commit 401d801

File tree

1 file changed

+9
-21
lines changed

1 file changed

+9
-21
lines changed

apps/sim/lib/knowledge/documents/document-processor.ts

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import { parseBuffer, parseFile } from '@/lib/file-parsers'
77
import type { FileParseMetadata } from '@/lib/file-parsers/types'
88
import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
99
import { StorageService } from '@/lib/uploads'
10-
import { isInternalFileUrl } from '@/lib/uploads/utils/file-utils'
10+
import { getExtensionFromMimeType, isInternalFileUrl } from '@/lib/uploads/utils/file-utils'
1111
import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server'
1212
import { mistralParserTool } from '@/tools/mistral/parser'
1313

@@ -759,38 +759,26 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
759759
: decodeURIComponent(base64Data)
760760
}
761761

762-
const extension = filename.split('.').pop()?.toLowerCase() || 'txt'
762+
const extension = filename.includes('.')
763+
? filename.split('.').pop()!.toLowerCase()
764+
: getExtensionFromMimeType(mimeType) ?? 'txt'
763765
const buffer = Buffer.from(base64Data, 'base64')
764766
const result = await parseBuffer(buffer, extension)
765767
return result.content
766768
}
767769

768-
const MIME_TO_EXTENSION: Record<string, string> = {
769-
'text/plain': 'txt',
770-
'text/markdown': 'md',
771-
'text/csv': 'csv',
772-
'text/html': 'html',
773-
'application/pdf': 'pdf',
774-
'application/json': 'json',
775-
'application/yaml': 'yaml',
776-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
777-
'application/msword': 'doc',
778-
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
779-
'application/vnd.ms-excel': 'xls',
780-
'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
781-
'application/vnd.ms-powerpoint': 'ppt',
782-
}
783-
784770
async function parseHttpFile(
785771
fileUrl: string,
786772
filename: string,
787773
mimeType?: string
788774
): Promise<{ content: string; metadata?: FileParseMetadata }> {
789775
const buffer = await downloadFileWithTimeout(fileUrl)
790776

791-
let extension = filename.split('.').pop()?.toLowerCase()
792-
if (!extension || extension === filename.toLowerCase()) {
793-
extension = mimeType ? MIME_TO_EXTENSION[mimeType] : undefined
777+
let extension = filename.includes('.')
778+
? filename.split('.').pop()?.toLowerCase()
779+
: undefined
780+
if (!extension && mimeType) {
781+
extension = getExtensionFromMimeType(mimeType) ?? undefined
794782
}
795783
if (!extension) {
796784
throw new Error(`Could not determine file type for: ${filename}`)

0 commit comments

Comments
 (0)