import { Injectable, Logger } from '@nestjs/common';
import * as path from 'path';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import { cleanUrlString } from '../utils/file.utils';
import { StorageService } from '../../storage/storage.service';
import { createReadStream } from 'node:fs';
import { promises as fs } from 'fs';
import { Readable } from 'stream';
import { getMimeType, sanitizeFileName } from '../../../common/helpers';
import { v7 } from 'uuid';
import { FileTask } from '@docmost/db/types/entity.types';
import { getAttachmentFolderPath } from '../../../core/attachment/attachment.utils';
import { AttachmentType } from '../../../core/attachment/attachment.constants';
import { unwrapFromParagraph } from '../utils/import-formatter';
import { resolveRelativeAttachmentPath } from '../utils/import.utils';
import { load } from 'cheerio';
import pLimit from 'p-limit';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../../queue/constants';

/**
 * Describes one attachment file belonging to an imported page.
 *
 * `href` is the value `processAttachments` uses as the lookup key into its
 * `attachmentCandidates` map when it resolves a Draw.io pair's PNG file.
 */
interface AttachmentInfo {
  href: string;
  fileName: string;
  mimeType: string;
}

/**
 * A Draw.io source file and its PNG counterpart; either side may be absent.
 *
 * `baseName` is what `processAttachments` prints when handling a pair fails.
 * NOTE(review): presumably also the key the two files are matched on — the
 * pairing logic is not in this part of the file, confirm there.
 */
interface DrawioPair {
  drawioFile?: AttachmentInfo;
  pngFile?: AttachmentInfo;
  baseName: string;
}

/**
 * Rewrites attachment references in imported page HTML and uploads the
 * referenced files to storage, recording each one in the `attachments` table.
 */
@Injectable()
export class ImportAttachmentService {
  private readonly logger = new Logger(ImportAttachmentService.name);
  // Concurrency cap handed to `pLimit` in `processAttachments`.
  private readonly CONCURRENT_UPLOADS = 3;
  // NOTE(review): presumably the retry budget for a failed upload — the
  // consumer is not in this part of the file, confirm there.
  private readonly MAX_RETRIES = 2;
  // NOTE(review): presumably the wait between retries, in milliseconds — confirm.
  private readonly RETRY_DELAY = 2000;

  /**
   * @param storageService Receives the uploaded file streams (`uploadStream`).
   * @param db Kysely handle used to insert rows into `attachments`.
   * @param attachmentQueue BullMQ queue registered as `QueueName.ATTACHMENT_QUEUE`.
   */
  constructor(
    private readonly storageService: StorageService,
    @InjectKysely() private readonly db: KyselyDB,
    @InjectQueue(QueueName.ATTACHMENT_QUEUE) private attachmentQueue: Queue,
  ) {}

  async processAttachments(opts: {
    html: string;
    pageRelativePath: string;
    extractDir: string;
    pageId: string;
    fileTask: FileTask;
    attachmentCandidates: Map;
    pageAttachments?: AttachmentInfo[];
  }): Promise {
    const {
      html,
      pageRelativePath,
      extractDir,
      pageId,
      fileTask,
      attachmentCandidates,
      pageAttachments = [],
    } =
opts; const attachmentTasks: (() => Promise)[] = []; const limit = pLimit(this.CONCURRENT_UPLOADS); const uploadStats = { total: 0, completed: 0, failed: 0, failedFiles: [] as string[], }; /** * Cache keyed by the *relative* path that appears in the HTML. * Ensures we upload (and DB-insert) each attachment at most once, * even if it's referenced multiple times on the page. */ const processed = new Map< string, { attachmentId: string; storageFilePath: string; apiFilePath: string; fileNameWithExt: string; abs: string; } >(); // Analyze attachments to identify Draw.io pairs const { drawioPairs, skipFiles } = this.analyzeAttachments(pageAttachments); // Map to store processed Draw.io SVGs const drawioSvgMap = new Map< string, { attachmentId: string; apiFilePath: string; fileName: string; } >(); //this.logger.debug(`Found ${drawioPairs.size} Draw.io pairs to process`); // Process Draw.io pairs and create combined SVG files for (const [drawioHref, pair] of drawioPairs) { if (!pair.drawioFile) continue; const drawioAbsPath = attachmentCandidates.get(drawioHref); if (!drawioAbsPath) continue; const pngAbsPath = pair.pngFile ? 
attachmentCandidates.get(pair.pngFile.href) : undefined; try { // Create combined SVG with Draw.io data and PNG image const svgBuffer = await this.createDrawioSvg(drawioAbsPath, pngAbsPath); // Generate file details - always use "diagram.drawio.svg" as filename const attachmentId = v7(); const fileName = 'diagram.drawio.svg'; const storageFilePath = `${getAttachmentFolderPath( AttachmentType.File, fileTask.workspaceId, )}/${attachmentId}/${fileName}`; const apiFilePath = `/api/files/${attachmentId}/${fileName}`; // Upload the SVG file attachmentTasks.push(async () => { try { const stream = Readable.from(svgBuffer); // Upload to storage await this.storageService.uploadStream(storageFilePath, stream); // Insert into database await this.db .insertInto('attachments') .values({ id: attachmentId, filePath: storageFilePath, fileName: fileName, fileSize: svgBuffer.length, mimeType: 'image/svg+xml', type: 'file', fileExt: '.svg', creatorId: fileTask.creatorId, workspaceId: fileTask.workspaceId, pageId, spaceId: fileTask.spaceId, }) .execute(); uploadStats.completed++; } catch (error) { uploadStats.failed++; uploadStats.failedFiles.push(fileName); this.logger.error( `Failed to upload Draw.io SVG ${fileName}:`, error, ); } }); // Store the mapping for both Draw.io and PNG references drawioSvgMap.set(drawioHref, { attachmentId, apiFilePath, fileName }); if (pair.pngFile) { drawioSvgMap.set(pair.pngFile.href, { attachmentId, apiFilePath, fileName, }); } } catch (error) { this.logger.error( `Failed to process Draw.io pair ${pair.baseName}:`, error, ); } } const uploadOnce = (relPath: string) => { const abs = attachmentCandidates.get(relPath)!; const attachmentId = v7(); const ext = path.extname(abs); const fileNameWithExt = sanitizeFileName(path.basename(abs, ext)) + ext.toLowerCase(); const storageFilePath = `${getAttachmentFolderPath( AttachmentType.File, fileTask.workspaceId, )}/${attachmentId}/${fileNameWithExt}`; const apiFilePath = 
`/api/files/${attachmentId}/${fileNameWithExt}`; attachmentTasks.push(() => this.uploadWithRetry({ abs, storageFilePath, attachmentId, fileNameWithExt, ext, pageId, fileTask, uploadStats, }), ); return { attachmentId, storageFilePath, apiFilePath, fileNameWithExt, abs, }; }; /** * – Returns cached data if we’ve already processed this path. * – Otherwise calls `uploadOnce`, stores the result, and returns it. */ const processFile = (relPath: string) => { const cached = processed.get(relPath); if (cached) return cached; const fresh = uploadOnce(relPath); processed.set(relPath, fresh); return fresh; }; const pageDir = path.dirname(pageRelativePath); const $ = load(html); // Cache for resolved paths to avoid repeated lookups const resolvedPathCache = new Map(); const getCachedResolvedPath = (rawPath: string): string | null => { if (resolvedPathCache.has(rawPath)) { return resolvedPathCache.get(rawPath)!; } const resolved = resolveRelativeAttachmentPath( rawPath, pageDir, attachmentCandidates, ); resolvedPathCache.set(rawPath, resolved); return resolved; }; // Cache for file stats to avoid repeated file system calls const statCache = new Map(); const getCachedStat = async (absPath: string) => { if (statCache.has(absPath)) { return statCache.get(absPath); } const stat = await fs.stat(absPath); statCache.set(absPath, stat); return stat; }; // Single DOM traversal for all attachment elements const selector = 'img, video, div[data-type="attachment"], a, div[data-type="excalidraw"], div[data-type="drawio"]'; const elements = $(selector).toArray(); for (const element of elements) { const $el = $(element); const tagName = element.tagName.toLowerCase(); // Process based on element type if (tagName === 'img') { const src = cleanUrlString($el.attr('src') ?? 
''); if (!src || src.startsWith('http')) continue; const relPath = getCachedResolvedPath(src); if (!relPath) continue; // Check if this image is part of a Draw.io pair const drawioSvg = drawioSvgMap.get(relPath); if (drawioSvg) { const $drawio = $('
') .attr('data-type', 'drawio') .attr('data-src', drawioSvg.apiFilePath) .attr('data-title', 'diagram') .attr('data-width', '100%') .attr('data-align', 'center') .attr('data-attachment-id', drawioSvg.attachmentId); $el.replaceWith($drawio); unwrapFromParagraph($, $drawio); continue; } const { attachmentId, apiFilePath, abs } = processFile(relPath); const stat = await getCachedStat(abs); $el .attr('src', apiFilePath) .attr('data-attachment-id', attachmentId) .attr('data-size', stat.size.toString()) .attr('width', $el.attr('width') ?? '100%') .attr('data-align', $el.attr('data-align') ?? 'center'); unwrapFromParagraph($, $el); } else if (tagName === 'video') { const src = cleanUrlString($el.attr('src') ?? ''); if (!src || src.startsWith('http')) continue; const relPath = getCachedResolvedPath(src); if (!relPath) continue; const { attachmentId, apiFilePath, abs } = processFile(relPath); const stat = await getCachedStat(abs); $el .attr('src', apiFilePath) .attr('data-attachment-id', attachmentId) .attr('data-size', stat.size.toString()) .attr('width', $el.attr('width') ?? '100%') .attr('data-align', $el.attr('data-align') ?? 'center'); unwrapFromParagraph($, $el); } else if (tagName === 'div') { const dataType = $el.attr('data-type'); if (dataType === 'attachment') { const rawUrl = cleanUrlString($el.attr('data-attachment-url') ?? ''); if (!rawUrl || rawUrl.startsWith('http')) continue; const relPath = getCachedResolvedPath(rawUrl); if (!relPath) continue; const { attachmentId, apiFilePath, abs } = processFile(relPath); const stat = await getCachedStat(abs); const fileName = path.basename(abs); const mime = getMimeType(abs); const $newDiv = $('
') .attr('data-type', 'attachment') .attr('data-attachment-url', apiFilePath) .attr('data-attachment-name', fileName) .attr('data-attachment-mime', mime) .attr('data-attachment-size', stat.size.toString()) .attr('data-attachment-id', attachmentId); $el.replaceWith($newDiv); unwrapFromParagraph($, $newDiv); } else if (dataType === 'excalidraw' || dataType === 'drawio') { const rawSrc = cleanUrlString($el.attr('data-src') ?? ''); if (!rawSrc || rawSrc.startsWith('http')) continue; const relPath = getCachedResolvedPath(rawSrc); if (!relPath) continue; const { attachmentId, apiFilePath, abs } = processFile(relPath); const stat = await getCachedStat(abs); const fileName = path.basename(abs); const $newDiv = $('
') .attr('data-type', dataType) .attr('data-src', apiFilePath) .attr('data-title', fileName) .attr('data-width', $el.attr('data-width') || '100%') .attr('data-size', stat.size.toString()) .attr('data-align', $el.attr('data-align') || 'center') .attr('data-attachment-id', attachmentId); $el.replaceWith($newDiv); unwrapFromParagraph($, $newDiv); } } else if (tagName === 'a') { const href = cleanUrlString($el.attr('href') ?? ''); if (!href || href.startsWith('http')) continue; const relPath = getCachedResolvedPath(href); if (!relPath) continue; // Check if this is a Draw.io file const drawioSvg = drawioSvgMap.get(relPath); if (drawioSvg) { const $drawio = $('
') .attr('data-type', 'drawio') .attr('data-src', drawioSvg.apiFilePath) .attr('data-title', 'diagram') .attr('data-width', '100%') .attr('data-align', 'center') .attr('data-attachment-id', drawioSvg.attachmentId); $el.replaceWith($drawio); unwrapFromParagraph($, $drawio); continue; } // Skip files that should be ignored if (skipFiles.has(relPath)) { $el.remove(); continue; } const { attachmentId, apiFilePath, abs } = processFile(relPath); const stat = await getCachedStat(abs); const ext = path.extname(relPath).toLowerCase(); if (ext === '.mp4') { const $video = $('