Skip to content

Error Handling

Handle errors gracefully when extracting documents and computing similarity.

Basic Error Handling

Check for errors in extraction results:

ts
import { extract } from 'undms';
import * as fs from 'fs';

const buffer = fs.readFileSync('./document.pdf');

const result = extract([
  {
    name: 'document.pdf',
    size: buffer.length,
    type: 'application/pdf',
    lastModified: Date.now(),
    webkitRelativePath: '',
    buffer,
  },
]);

const doc = result[0]?.documents[0];

if (doc?.error) {
  console.error('Extraction failed:', doc.error);
} else {
  console.log('Extracted content:', doc?.content);
}

Handling Multiple Document Errors

Process results and collect errors:

ts
import { extract } from 'undms';
import * as fs from 'fs';

const files = ['./valid.pdf', './corrupted.pdf', './valid.docx', './empty.txt'];

const documents = files.map((path) => {
  const buffer = fs.readFileSync(path);
  const stats = fs.statSync(path);
  return {
    name: path,
    size: stats.size,
    type: guessMimeType(path),
    lastModified: stats.mtimeMs,
    webkitRelativePath: '',
    buffer,
  };
});

const results = extract(documents);

const errors: { file: string; error: string }[] = [];
const successes: { file: string; wordCount: number }[] = [];

for (const group of results) {
  for (const doc of group.documents) {
    if (doc.error) {
      errors.push({ file: doc.name, error: doc.error });
    } else {
      successes.push({
        file: doc.name,
        wordCount: doc.metadata?.text?.wordCount || 0,
      });
    }
  }
}

console.log(`Processed: ${successes.length} successful, ${errors.length} failed`);

if (errors.length > 0) {
  console.log('\nErrors:');
  errors.forEach(({ file, error }) => console.log(`  ${file}: ${error}`));
}

Retry Logic

Implement retry logic for failed extractions:

ts
import { extract } from 'undms';
import * as fs from 'fs';

interface RetryOptions {
  maxRetries: number;
  initialDelay: number;
  backoffMultiplier: number;
}

async function extractWithRetry(
  filePath: string,
  mimeType: string,
  options: RetryOptions = { maxRetries: 3, initialDelay: 1000, backoffMultiplier: 2 },
) {
  let lastError: Error | null = null;
  let delay = options.initialDelay;

  for (let attempt = 1; attempt <= options.maxRetries; attempt++) {
    try {
      const buffer = fs.readFileSync(filePath);
      const stats = fs.statSync(filePath);

      const result = extract([
        {
          name: filePath,
          size: stats.size,
          type: mimeType,
          lastModified: stats.mtimeMs,
          webkitRelativePath: '',
          buffer,
        },
      ]);

      const doc = result[0]?.documents[0];

      if (doc?.error) {
        throw new Error(doc.error);
      }

      return doc;
    } catch (e) {
      lastError = e instanceof Error ? e : new Error(String(e));
      console.warn(`Attempt ${attempt}/${options.maxRetries} failed: ${lastError.message}`);

      if (attempt < options.maxRetries) {
        await new Promise((r) => setTimeout(r, delay));
        delay *= options.backoffMultiplier;
      }
    }
  }

  throw new Error(`Failed after ${options.maxRetries} attempts: ${lastError?.message}`);
}

Graceful Degradation

Continue processing even when some documents fail:

ts
import { extract } from 'undms';
import * as fs from 'fs';

interface ProcessingResult {
  filename: string;
  success: boolean;
  content?: string;
  metadata?: any;
  error?: string;
  fallback?: string;
}

function processDocuments(files: string[]): ProcessingResult[] {
  return files.map((filePath) => {
    try {
      const buffer = fs.readFileSync(filePath);
      const stats = fs.statSync(filePath);
      const mimeType = guessMimeType(filePath);

      const result = extract([
        {
          name: filePath,
          size: stats.size,
          type: mimeType,
          lastModified: stats.mtimeMs,
          webkitRelativePath: '',
          buffer,
        },
      ]);

      const doc = result[0]?.documents[0];

      if (doc?.error) {
        return {
          filename: filePath,
          success: false,
          error: doc.error,
          fallback: 'Could not extract content',
        };
      }

      return {
        filename: filePath,
        success: true,
        content: doc?.content,
        metadata: doc?.metadata,
      };
    } catch (e) {
      return {
        filename: filePath,
        success: false,
        error: e instanceof Error ? e.message : 'Unknown error',
        fallback: 'File could not be read',
      };
    }
  });
}

// Process and collect results
const results = processDocuments(['./doc1.pdf', './doc2.pdf', './doc3.docx']);

// Use successful results, handle failures
const successful = results.filter((r) => r.success);
const failed = results.filter((r) => !r.success);

console.log(`Success: ${successful.length}, Failed: ${failed.length}`);

Error Recovery Strategies

Different strategies for handling errors:

ts
import { extract } from 'undms';
import * as fs from 'fs';

enum RecoveryStrategy {
  Skip = 'skip',
  Fallback = 'fallback',
  Retry = 'retry',
  Default = 'default',
}

interface DocumentResult {
  name: string;
  content: string;
  strategy: RecoveryStrategy;
  error?: string;
}

function processWithRecovery(
  files: { path: string; type: string; required: boolean }[],
  strategy: RecoveryStrategy,
): DocumentResult[] {
  return files.map(({ path, type, required }) => {
    try {
      const buffer = fs.readFileSync(path);
      const stats = fs.statSync(path);

      const result = extract([
        {
          name: path,
          size: stats.size,
          type,
          lastModified: stats.mtimeMs,
          webkitRelativePath: '',
          buffer,
        },
      ]);

      const doc = result[0]?.documents[0];

      if (doc?.error) {
        return handleError(path, doc.error, required, strategy);
      }

      return {
        name: path,
        content: doc?.content || '',
        strategy: RecoveryStrategy.Skip,
      };
    } catch (e) {
      return handleError(
        path,
        e instanceof Error ? e.message : 'Unknown error',
        required,
        strategy,
      );
    }
  });
}

function handleError(
  path: string,
  error: string,
  required: boolean,
  strategy: RecoveryStrategy,
): DocumentResult {
  switch (strategy) {
    case RecoveryStrategy.Skip:
      return { name: path, content: '', strategy, error };

    case RecoveryStrategy.Fallback:
      return {
        name: path,
        content: '[Content extraction failed - using placeholder]',
        strategy,
        error,
      };

    case RecoveryStrategy.Retry:
      // Retry logic would go here
      return { name: path, content: '', strategy, error };

    case RecoveryStrategy.Default:
      if (required) {
        throw new Error(`Required file ${path} failed: ${error}`);
      }
      return { name: path, content: '', strategy, error };

    default:
      return { name: path, content: '', strategy, error };
  }
}

Logging Errors

Implement proper error logging:

ts
import { extract } from 'undms';
import * as fs from 'fs';

interface ErrorLog {
  timestamp: Date;
  filename: string;
  mimeType: string;
  error: string;
  size: number;
}

const errorLogs: ErrorLog[] = [];

function extractWithLogging(files: { path: string; type: string }[]) {
  const documents = files.map((f) => {
    const buffer = fs.readFileSync(f.path);
    const stats = fs.statSync(f.path);
    return {
      name: f.path,
      size: stats.size,
      type: f.type,
      lastModified: stats.mtimeMs,
      webkitRelativePath: '',
      buffer,
    };
  });

  const results = extract(documents);

  for (const group of results) {
    for (const doc of group.documents) {
      if (doc.error) {
        errorLogs.push({
          timestamp: new Date(),
          filename: doc.name,
          mimeType: group.mimeType,
          error: doc.error,
          size: doc.size,
        });

        console.error(`[ERROR] ${doc.name}: ${doc.error}`);
      }
    }
  }

  return results;
}

// Log summary at the end
process.on('exit', () => {
  if (errorLogs.length > 0) {
    console.log('\n--- Error Summary ---');
    console.log(`Total errors: ${errorLogs.length}`);
    console.log('Errors by type:');

    const byType = errorLogs.reduce(
      (acc, log) => {
        acc[log.mimeType] = (acc[log.mimeType] || 0) + 1;
        return acc;
      },
      {} as Record<string, number>,
    );

    console.log(byType);
  }
});

Validation Before Extraction

Validate files before processing:

ts
import { extract } from 'undms';
import * as fs from 'fs';

interface ValidationResult {
  valid: boolean;
  error?: string;
}

function validateFile(filePath: string, mimeType: string): ValidationResult {
  try {
    const stats = fs.statSync(filePath);

    if (!stats.isFile()) {
      return { valid: false, error: 'Not a file' };
    }

    if (stats.size === 0) {
      return { valid: false, error: 'File is empty' };
    }

    if (stats.size > 100 * 1024 * 1024) {
      return { valid: false, error: 'File exceeds 100MB limit' };
    }

    const supportedTypes = [
      'text/plain',
      'application/pdf',
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
      'image/jpeg',
      'image/png',
    ];

    if (!supportedTypes.includes(mimeType)) {
      return { valid: false, error: `Unsupported MIME type: ${mimeType}` };
    }

    return { valid: true };
  } catch (e) {
    return {
      valid: false,
      error: e instanceof Error ? e.message : 'Unknown error',
    };
  }
}

function extractWithValidation(files: { path: string; type: string }[]) {
  const validFiles: typeof files = [];
  const invalidFiles: { path: string; error: string }[] = [];

  for (const file of files) {
    const validation = validateFile(file.path, file.type);

    if (validation.valid) {
      validFiles.push(file);
    } else {
      invalidFiles.push({ path: file.path, error: validation.error! });
    }
  }

  if (invalidFiles.length > 0) {
    console.warn('Skipped invalid files:');
    invalidFiles.forEach((f) => console.warn(`  ${f.path}: ${f.error}`));
  }

  if (validFiles.length === 0) {
    return [];
  }

  const documents = validFiles.map((f) => {
    const buffer = fs.readFileSync(f.path);
    const stats = fs.statSync(f.path);
    return {
      name: f.path,
      size: stats.size,
      type: f.type,
      lastModified: stats.mtimeMs,
      webkitRelativePath: '',
      buffer,
    };
  });

  return extract(documents);
}

Custom Error Types

Create custom error types for better error handling:

ts
import { extract } from 'undms';
import * as fs from 'fs';

class ExtractionError extends Error {
  constructor(
    message: string,
    public filename: string,
    public mimeType: string,
    public code?: string,
  ) {
    super(message);
    this.name = 'ExtractionError';
  }
}

class UnsupportedFormatError extends ExtractionError {
  constructor(filename: string, mimeType: string) {
    super(`Unsupported format: ${mimeType}`, filename, mimeType, 'UNSUPPORTED_FORMAT');
    this.name = 'UnsupportedFormatError';
  }
}

class CorruptedFileError extends ExtractionError {
  constructor(filename: string, originalError: string) {
    super(`File corrupted: ${originalError}`, filename, '', 'CORRUPTED_FILE');
    this.name = 'CorruptedFileError';
  }
}

function extractStrict(files: { path: string; type: string }[]) {
  return files.map((file) => {
    const buffer = fs.readFileSync(file.path);
    const stats = fs.statSync(file.path);

    if (!isSupported(file.type)) {
      throw new UnsupportedFormatError(file.path, file.type);
    }

    const result = extract([
      {
        name: file.path,
        size: stats.size,
        type: file.type,
        lastModified: stats.mtimeMs,
        webkitRelativePath: '',
        buffer,
      },
    ]);

    const doc = result[0]?.documents[0];

    if (doc?.error) {
      throw new CorruptedFileError(file.path, doc.error);
    }

    return doc;
  });
}

Global Error Handler

Set up global error handling for uncaught exceptions:

ts
import { extract } from 'undms';

process.on('uncaughtException', (error) => {
  console.error('Uncaught Exception:', error.message);
  console.error(error.stack);
  process.exit(1);
});

process.on('unhandledRejection', (reason) => {
  console.error('Unhandled Rejection:', reason);
  process.exit(1);
});

function safeExtract(documents: any[]) {
  try {
    return extract(documents);
  } catch (e) {
    console.error('Extraction failed:', e);
    return [];
  }
}

Released under the MIT License.