Skip to content

computeDocumentSimilarity

Extracts the text content of each document and computes its similarity against a set of reference texts.

Function Signature

ts
/**
 * Extracts documents and computes similarity against reference texts.
 *
 * @param documents - Documents to extract and compare.
 * @param referenceTexts - Candidate reference texts.
 * @param similarityThreshold - Minimum score (0-100) to include a match; defaults to 30.0.
 * @param similarityMethod - One of 'jaccard', 'ngram', 'levenshtein', 'hybrid'; defaults to 'hybrid'.
 * @returns Documents grouped by MIME type, each with similarity matches.
 */
function computeDocumentSimilarity(
  documents: Document[],
  referenceTexts: string[],
  similarityThreshold?: number,
  similarityMethod?: string,
): GroupedDocumentsWithSimilarity[];

Parameters

| Parameter | Type | Required | Default | Description |
| --- | --- | --- | --- | --- |
| `documents` | `Document[]` | Yes | - | Documents to extract and compare |
| `referenceTexts` | `string[]` | Yes | - | Candidate reference texts |
| `similarityThreshold` | `number` | No | `30.0` | Minimum score (0-100) to include a match |
| `similarityMethod` | `string` | No | `'hybrid'` | One of: `'jaccard'`, `'ngram'`, `'levenshtein'`, `'hybrid'` |

Returns

GroupedDocumentsWithSimilarity[] — Documents grouped by MIME type, each with similarity matches.

Example

Basic Usage

ts
import { computeDocumentSimilarity } from 'undms';

// A single in-memory plain-text document to analyze.
const documents = [
  {
    name: 'document.txt',
    size: 100,
    type: 'text/plain',
    lastModified: Date.now(),
    webkitRelativePath: '',
    buffer: Buffer.from('hello world from undms'),
  },
];

// Candidate texts; the first is identical to the document's content.
const referenceTexts = ['hello world from undms', 'different text'];

// Only matches scoring at least 90 are kept, using the 'hybrid' method.
const result = computeDocumentSimilarity(documents, referenceTexts, 90, 'hybrid');

const firstGroup = result[0];
console.log(firstGroup.documents[0].similarityMatches);
// [{ referenceIndex: 0, similarityPercentage: 100 }]

Finding Similar Documents

ts
import { computeDocumentSimilarity } from 'undms';

// Builds a plain-text document entry from a file name and its text.
const textDocument = (name: string, text: string) => ({
  name,
  size: 100,
  type: 'text/plain',
  lastModified: Date.now(),
  webkitRelativePath: '',
  buffer: Buffer.from(text),
});

const documents = [
  textDocument('doc1.txt', 'The quick brown fox jumps over the lazy dog'),
  textDocument('doc2.txt', 'A quick brown fox jumps over a lazy dog'),
  textDocument('doc3.txt', 'Something completely different here'),
];

const referenceTexts = [
  'The quick brown fox jumps over the lazy dog',
  'A lazy dog resting in the sun',
  'Completely unrelated content about cooking',
];

// Keep matches that score at least 50 with the hybrid method.
const results = computeDocumentSimilarity(documents, referenceTexts, 50, 'hybrid');

// Results come back grouped by MIME type; walk each group and its documents.
for (const group of results) {
  console.log(`\nMIME Type: ${group.mimeType}`);

  for (const doc of group.documents) {
    console.log(`Document: ${doc.name}`);
    console.log(`Content: ${doc.content}`);

    if (doc.similarityMatches.length === 0) {
      console.log('No matches above threshold');
      continue;
    }

    console.log('Similarity Matches:');
    for (const match of doc.similarityMatches) {
      // referenceIndex points back into the referenceTexts array passed above.
      const matchedText = referenceTexts[match.referenceIndex];
      const score = `${match.similarityPercentage.toFixed(1)}%`;
      console.log(`  - Reference ${match.referenceIndex} (${matchedText}): ` + score);
    }
  }
}

Using Different Methods

ts
import { computeDocumentSimilarity } from 'undms';

const document = {
  name: 'sample.txt',
  size: 500,
  type: 'text/plain',
  lastModified: Date.now(),
  webkitRelativePath: '',
  buffer: Buffer.from('machine learning is a subset of artificial intelligence'),
};

const references = [
  'machine learning is a subset of artificial intelligence',
  'deep learning is a specialized form of machine learning',
  'artificial intelligence encompasses machine learning techniques',
  'the weather is sunny today',
];

// Runs the same document/reference comparison (threshold 30) with the given method.
const compareUsing = (method: string) =>
  computeDocumentSimilarity([document], references, 30, method);

// Jaccard - word set comparison
const jaccardResults = compareUsing('jaccard');

// N-gram - character trigram matching
const ngramResults = compareUsing('ngram');

// Levenshtein - edit distance
const levenshteinResults = compareUsing('levenshtein');

// Hybrid - combined (default)
const hybridResults = compareUsing('hybrid');

// Print the matches of the single document for each method, in the same order.
const labelledResults: Array<[string, typeof jaccardResults]> = [
  ['Jaccard:', jaccardResults],
  ['N-gram:', ngramResults],
  ['Levenshtein:', levenshteinResults],
  ['Hybrid:', hybridResults],
];

for (const [label, groups] of labelledResults) {
  console.log(label, groups[0].documents[0].similarityMatches);
}

Plagiarism Detection

ts
import { computeDocumentSimilarity } from 'undms';

/** A student's submitted text to be checked against the reference sources. */
interface Submission {
  student: string;
  content: string;
}

const submissions: Submission[] = [
  { student: 'Alice', content: 'Machine learning enables computers to learn from data' },
  { student: 'Bob', content: 'Machine learning allows computers to learn from data' },
  { student: 'Charlie', content: 'Deep learning uses neural networks to process data' },
];

const referenceSources = [
  'Machine learning is a subset of artificial intelligence that enables computers to learn from data',
  'Deep learning is a specialized form of machine learning using neural networks',
];

// Wrap each submission as an in-memory plain-text document named "<student>.txt".
const documents = submissions.map((s) => {
  const buffer = Buffer.from(s.content);
  return {
    name: `${s.student}.txt`,
    // Use the encoded byte length: `s.content.length` counts UTF-16 code units,
    // which differs from the buffer size for non-ASCII text.
    size: buffer.length,
    type: 'text/plain',
    lastModified: Date.now(),
    webkitRelativePath: '',
    buffer,
  };
});

// Matches scoring below this percentage are not reported.
const threshold = 60;
const results = computeDocumentSimilarity(documents, referenceSources, threshold, 'hybrid');

console.log(`Plagiarism Check (threshold: ${threshold}%)\n`);

for (const group of results) {
  for (const doc of group.documents) {
    // Map the document back to its submission through the "<student>.txt" file name.
    const student = submissions.find((s) => `${s.student}.txt` === doc.name);
    console.log(`Student: ${student?.student}`);

    if (doc.similarityMatches.length === 0) {
      console.log('  ✅ No significant matches found');
    }

    for (const match of doc.similarityMatches) {
      const score = `${match.similarityPercentage.toFixed(1)}%`;
      console.log(`  ⚠️  Potential match with source ${match.referenceIndex}: ` + score);
    }

    // Blank line between students.
    console.log();
  }
}

Working with PDFs

ts
import { computeDocumentSimilarity } from 'undms';
import * as fs from 'fs';

/**
 * Reads a PDF from disk and compares it against the given reference texts.
 *
 * The file is passed as a single 'application/pdf' document and scored with
 * the 'hybrid' method at a threshold of 40.
 *
 * @param pdfPath - Path of the PDF file to read; also used as the document name.
 * @param references - Reference texts to compare against.
 * @returns The document's `content` plus, for every match, the matched
 *   reference text and its similarity percentage.
 */
function checkPdfSimilarity(pdfPath: string, references: string[]) {
  const fileBytes = fs.readFileSync(pdfPath);

  const pdfDocument = {
    name: pdfPath,
    size: fileBytes.length,
    type: 'application/pdf',
    lastModified: Date.now(),
    webkitRelativePath: '',
    buffer: fileBytes,
  };

  const groups = computeDocumentSimilarity([pdfDocument], references, 40, 'hybrid');

  // Only one document was submitted, so read the first document of the first group.
  const { content, similarityMatches } = groups[0].documents[0];

  // Resolve each match's index back to the reference text it points to.
  const matches = similarityMatches.map(({ referenceIndex, similarityPercentage }) => ({
    reference: references[referenceIndex],
    similarity: similarityPercentage,
  }));

  return { content, matches };
}

const references = [
  'Introduction to machine learning algorithms',
  'Advanced data structures and algorithms',
  'Web development best practices',
];

const result = checkPdfSimilarity('./research_paper.pdf', references);

// Show only the first 200 characters of the extracted text.
const preview = result.content.substring(0, 200);
console.log('Extracted Content Preview:');
console.log(preview);

console.log('\nSimilarity Matches:');
for (const { similarity, reference } of result.matches) {
  console.log(`  ${similarity.toFixed(1)}% - ${reference}`);
}

Adjusting Threshold

ts
import { computeDocumentSimilarity } from 'undms';

const document = {
  name: 'text.txt',
  size: 100,
  type: 'text/plain',
  lastModified: Date.now(),
  webkitRelativePath: '',
  buffer: Buffer.from('The quick brown fox jumps over the lazy dog'),
};

// Trailing comments give the approximate score of each reference against the document.
const references = [
  'The quick brown fox jumps over the lazy dog', // 100%
  'A quick brown fox jumps over a lazy dog', // ~85%
  'Quick brown fox lazy dog', // ~70%
  'The cat sat on the mat', // ~20%
];

// Runs the hybrid comparison at the given minimum score.
const compareAt = (minimumScore: number) =>
  computeDocumentSimilarity([document], references, minimumScore, 'hybrid');

// High threshold - only exact matches
const strictResults = compareAt(90);

// Medium threshold - close variations
const moderateResults = compareAt(60);

// Low threshold - any similarity
const lenientResults = compareAt(20);

// Number of matches reported for the single submitted document.
const matchCount = (groups: typeof strictResults) =>
  groups[0].documents[0].similarityMatches.length;

console.log('Strict (90%):', matchCount(strictResults), 'matches');
console.log('Moderate (60%):', matchCount(moderateResults), 'matches');
console.log('Lenient (20%):', matchCount(lenientResults), 'matches');

Error Handling

ts
import { computeDocumentSimilarity } from 'undms';

// A document declared as DOCX whose bytes are not a real DOCX file.
const corruptedDocx = {
  name: 'corrupted.docx',
  size: 100,
  type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  lastModified: Date.now(),
  webkitRelativePath: '',
  buffer: Buffer.from('not a real docx'),
};

const documents = [corruptedDocx];

const results = computeDocumentSimilarity(documents, ['reference text'], 50, 'hybrid');

const firstGroup = results[0];
const doc = firstGroup.documents[0];

// Extraction problems are reported on the document's `error` field.
if (doc.error) {
  console.log('Extraction error:', doc.error);
  console.log('Content may be incomplete or empty');
}

// Similarity still computed on available content
console.log('Similarity matches:', doc.similarityMatches);

Released under the MIT License.