How to scan PDF uploads for malware in Node.js
PDFs look like boring document files, but the PDF specification supports embedded JavaScript, embedded executables, links that can trigger actions in PDF viewers, and complex font and stream parsing that has historically been a rich source of exploits.
If your application accepts PDF uploads — invoices, contracts, CVs, reports — and then serves them to other users or opens them programmatically, you need to scan them. This guide shows how to do that with pompelmi and ClamAV.
What ClamAV catches in PDFs
ClamAV includes signatures for PDF-based threats including:
- PDFs with embedded JavaScript that calls app.launch() or this.exportDataObject()
- PDFs that embed executable files (EXE, DLL) in their data streams
- PDFs exploiting known vulnerabilities in Acrobat Reader and Foxit
- PDFs used as dropper vehicles for ransomware and trojans
- Polyglot files — files that are valid both as PDFs and as another executable format
pompelmi passes the file directly to clamscan, which handles
all of this automatically. You do not need to parse PDF structure yourself.
Install
npm install pompelmi multer express
ClamAV must be installed on the host system. See How to install ClamAV on macOS, Linux and Windows if you haven't done that yet.
Scanning a PDF file
The pompelmi API is the same for every file type. Pass the absolute path
to scan():
const { scan, Verdict } = require('pompelmi');
const path = require('path');
/**
 * Scans a single PDF on disk and reports whether it is safe to use.
 *
 * The path is resolved to an absolute path before being handed to scan().
 * Any verdict other than Verdict.Clean is reported as unsafe, so an
 * unexpected verdict value can never be mistaken for a clean file.
 *
 * @param {string} filePath - Path to the PDF (relative or absolute).
 * @returns {Promise<{safe: boolean, reason?: string}>} `safe: true` only for
 *   a Clean verdict; otherwise `safe: false` with a human-readable reason.
 *   A rejection from scan() is not caught here and propagates to the caller.
 */
async function scanPdf(filePath) {
  const abs = path.resolve(filePath);
  const verdict = await scan(abs);

  switch (verdict) {
    case Verdict.Clean:
      return { safe: true };
    case Verdict.Malicious:
      return { safe: false, reason: 'Malware detected in PDF.' };
    case Verdict.ScanError:
      // A scan error on a PDF often means a corrupted or truncated file.
      // Treat as untrusted — do not serve or process it.
      return { safe: false, reason: 'Scan could not complete.' };
    default:
      // Fail closed: without this branch an unrecognised verdict would make
      // the function resolve to undefined instead of an unsafe result.
      return { safe: false, reason: 'Scan could not complete.' };
  }
}
// Example invocation: print the result. The rejection handler reports a
// failed scan instead of leaving the promise chain unhandled.
scanPdf('./uploads/contract.pdf')
  .then(console.log)
  .catch((err) => {
    console.error('PDF scan failed:', err);
  });
Express endpoint for PDF uploads
const express = require('express');
const multer = require('multer');
const { scan, Verdict } = require('pompelmi');
const path = require('path');
const fs = require('fs');
const os = require('os');
// Largest upload multer will accept, in bytes (20 MB limit).
const MAX_PDF_BYTES = 20 * 1024 * 1024;

// Express application that the upload route is registered on.
const app = express();

// Multer instance: incoming files are written to the OS temp directory and
// anything above the size limit is rejected.
const upload = multer({
  dest: os.tmpdir(),
  limits: { fileSize: MAX_PDF_BYTES },
});
/**
 * POST /upload-pdf — accepts one multipart file in the "pdf" field, scans it,
 * and moves it into /var/app/pdfs only when the verdict is Verdict.Clean.
 *
 * Responses:
 *   200 { status: 'ok', file }  file stored
 *   400                         no file, invalid file name, or malware found
 *   415                         MIME type is not application/pdf
 *   422                         scan did not produce a Clean verdict
 *   500                         unexpected failure (details are logged only)
 *
 * The temporary upload is removed on every path that does not promote it.
 */
app.post('/upload-pdf', upload.single('pdf'), async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ error: 'No file provided.' });
  }

  const tmpPath = req.file.path;
  let promoted = false;

  try {
    // Reject non-PDFs based on MIME type (client-supplied, but useful as a quick filter)
    if (req.file.mimetype !== 'application/pdf') {
      return res.status(415).json({ error: 'Only PDF files are accepted.' });
    }

    const verdict = await scan(tmpPath);

    if (verdict === Verdict.Malicious) {
      return res.status(400).json({ error: 'Malware detected in PDF. Upload rejected.' });
    }

    // Fail closed: ScanError and any unrecognised verdict are both refused.
    if (verdict !== Verdict.Clean) {
      return res.status(422).json({
        error: 'PDF could not be scanned. The file may be corrupted.'
      });
    }

    // originalname is client-supplied: keep only the final path segment so it
    // cannot point outside the storage directory.
    const safeName = path.basename(req.file.originalname);
    if (safeName === '' || safeName === '.' || safeName === '..') {
      return res.status(400).json({ error: 'Invalid file name.' });
    }

    // Clean — move to storage
    const dest = path.join('/var/app/pdfs', safeName);
    try {
      fs.renameSync(tmpPath, dest);
      promoted = true;
    } catch (moveErr) {
      if (moveErr.code !== 'EXDEV') {
        throw moveErr;
      }
      // rename cannot cross filesystems (the temp dir is often a separate
      // mount). Copy instead; `promoted` stays false so the finally block
      // removes the temporary copy.
      fs.copyFileSync(tmpPath, dest);
    }

    res.json({ status: 'ok', file: safeName });
  } catch (err) {
    // Log the details server-side; do not expose internal messages or paths.
    console.error('PDF upload failed:', err);
    res.status(500).json({ error: 'Upload could not be processed.' });
  } finally {
    if (!promoted) {
      try {
        fs.unlinkSync(tmpPath);
      } catch (cleanupErr) {
        // A throw here would escape the async handler as an unhandled
        // rejection. A missing temp file is fine; log anything else.
        if (cleanupErr.code !== 'ENOENT') {
          console.error('Could not remove temporary upload:', cleanupErr);
        }
      }
    }
  }
});
Validating the PDF format before scanning
A file can be named .pdf without actually being a PDF. Validate
the file's magic bytes before scanning to catch obvious spoofs early. The PDF
format starts with the bytes 25 50 44 46 (%PDF).
const fs = require('fs');
/**
 * Returns true if the file starts with the PDF magic bytes (%PDF).
 * This is a fast, synchronous check — not a full format validation.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {boolean} True only when the first four bytes are 25 50 44 46.
 * @throws {Error} If the file cannot be opened or read.
 */
function hasPdfMagicBytes(filePath) {
  const magic = Buffer.from([0x25, 0x50, 0x44, 0x46]); // "%PDF"
  const head = Buffer.alloc(magic.length);
  const fd = fs.openSync(filePath, 'r');
  let bytesRead;
  try {
    bytesRead = fs.readSync(fd, head, 0, head.length, 0);
  } finally {
    // Always release the descriptor, even when the read throws.
    fs.closeSync(fd);
  }
  // Compare raw bytes rather than an 'ascii'-decoded string: that decoding
  // clears the high bit of every byte, so A5 D0 C4 C6 would read as "%PDF".
  return bytesRead === magic.length && head.equals(magic);
}
// Use before scanning.
// NOTE(review): this fragment is not standalone — the bare `return` and the
// `res` / `tmpPath` references mean it has to sit inside a request handler,
// presumably the upload route, ahead of its scan() call — confirm placement.
if (!hasPdfMagicBytes(tmpPath)) {
// Remove the rejected upload before responding.
fs.unlinkSync(tmpPath);
// 415: the content does not start with the PDF signature.
return res.status(415).json({ error: 'File does not appear to be a valid PDF.' });
}
Serving scanned PDFs safely
Even a clean PDF should be served with appropriate HTTP headers. A PDF containing JavaScript cannot execute on your server, but it can execute in the user's browser if rendered inline by the PDF viewer plugin.
/**
 * GET /pdfs/:filename — serves a stored PDF from /var/app/pdfs as a forced
 * download. Responds 404 when the name does not resolve to a regular file.
 */
app.get('/pdfs/:filename', (req, res) => {
  // Strip path components to prevent directory traversal
  const safeName = path.basename(req.params.filename);
  const filePath = path.join('/var/app/pdfs', safeName);

  // basename() still lets "." and ".." through, and those resolve to
  // directories, so require a regular file rather than mere existence.
  let isRegularFile = false;
  try {
    isRegularFile = fs.statSync(filePath).isFile();
  } catch (err) {
    // Missing or unreadable path: answered with 404 below.
    isRegularFile = false;
  }
  if (!isRegularFile) {
    return res.status(404).json({ error: 'Not found.' });
  }

  // Force download — do not render inline in the browser.
  // res.attachment() quotes and encodes the filename parameter, so names
  // containing quotes or non-Latin-1 characters cannot corrupt the header.
  res.attachment(safeName);
  // res.attachment() derives Content-Type from the extension; pin it to PDF.
  res.setHeader('Content-Type', 'application/pdf');
  // Prevent the browser from sniffing the content type
  res.setHeader('X-Content-Type-Options', 'nosniff');
  res.sendFile(filePath);
});
For more on multi-file upload patterns see Scanning multipart uploads with pompelmi and Multer.