\n`, warnings };
}
/**
 * Convert a .docx payload to an HTML fragment via mammoth.
 *
 * @param {ArrayBuffer} arrayBuffer - raw bytes of the .docx file, handed to
 *   mammoth.convertToHtml as `{ arrayBuffer }`.
 * @returns {Promise<{html: string, warnings: string[]}>} the sanitized HTML
 *   wrapped in leading/trailing newlines, plus `docx: `-prefixed warnings
 *   (stripped-URL notices first, then mammoth's own messages).
 * @throws {Error} with a `docx: ` prefix when mammoth fails, or when the
 *   conversion yields nothing but whitespace.
 */
async function convertDocx(arrayBuffer) {
  const label = (text) => `docx: ${text}`;

  let converted;
  try {
    converted = await mammoth.convertToHtml({ arrayBuffer });
  } catch (failure) {
    // Normalise whatever was thrown (Error or otherwise) into one message.
    const detail = failure && failure.message ? failure.message : String(failure);
    throw new Error('docx: ' + detail);
  }

  const markup = (converted.value || '').trim();
  if (markup === '') {
    throw new Error('docx: produced empty document — input may be corrupt or empty');
  }

  // URL-scheme scrubbing happens before anything is returned to the caller.
  const cleaned = sanitizeMammothUrls(markup);
  const urlNotices = cleaned.skipped.map((note) => label(note));
  const mammothNotices = (converted.messages || []).map((m) => label(m.message));

  return { html: `\n${cleaned.html}\n`, warnings: [...urlNotices, ...mammothNotices] };
}
// Mammoth doesn't filter URL schemes — a docx with a `javascript:` hyperlink
// would land in the imported document and execute on click (stored XSS in the
// downloaded rwa container). Mirror of cli/src/import.mjs sanitizeMammothUrls.
// Schemes allowed on any URL-bearing attribute (compared lower-case, without
// the trailing colon).
const _SAFE_HREF_SCHEMES = new Set(['http', 'https', 'mailto', 'tel']);
// Two layers, both required:
// 1) Strip invisibles before parsing — whitespace + C0/C1 controls (\x00-\x1f,
// \x7f-\xa0) + soft hyphen (\xad) + Cf-class format chars: ZWSP/ZWNJ/ZWJ and
// LRM/RLM (\u200b-\u200f), LRE/RLE/PDF/LRO/RLO (\u202a-\u202e), word joiner
// through the deprecated format controls (\u2060-\u206f) and BOM (\ufeff).
// JS \s alone doesn't match these — they would slip through and let a docx
// with an obfuscated `javascript:…` href bypass the scheme check.
// The ranges are written as \u escapes on purpose: literal invisible code
// points cannot be reviewed in an editor and are easily dropped by tooling,
// which leaves an invalid, out-of-order character-class range behind.
// 2) Parse via WHATWG URL — the same parser the browser uses to navigate.
// Resolve against a synthetic base so scheme-less inputs (relative URL,
// fragment, path) round-trip back to that base and pass.
const _ATTR_STRIP_RE = /[\s\x00-\x1f\x7f-\xa0\xad\u200b-\u200f\u202a-\u202e\u2060-\u206f\ufeff]/g;
const _SANITIZER_BASE = 'http://_rwa_sanitizer_base_/';
/**
 * Decide whether a URL-bearing attribute value may be kept as-is.
 *
 * @param {string} attr - attribute name; only 'src' receives the
 *   `data:image/*` allowance.
 * @param {*} val - raw attribute value (coerced with String()).
 * @returns {boolean} true when the value is scheme-less/relative, unparseable,
 *   uses an allowlisted scheme, or is a `data:image/…` src; false otherwise.
 */
function _attrIsSafe(attr, val) {
  const normalized = String(val).replace(_ATTR_STRIP_RE, '');
  let parsed;
  try {
    parsed = new URL(normalized, _SANITIZER_BASE);
  } catch {
    return true; // unparseable → cannot be an active URL scheme
  }
  // Resolved relative — no scheme present. The protocol must match the base as
  // well as the origin: a blob: URL reports the origin of the URL it wraps, so
  // an origin-only comparison would wave `blob:http://_rwa_sanitizer_base_/…`
  // through as if it were scheme-less.
  if (parsed.protocol === 'http:' && parsed.origin === 'http://_rwa_sanitizer_base_') {
    return true;
  }
  const proto = parsed.protocol.replace(/:$/, '').toLowerCase();
  if (_SAFE_HREF_SCHEMES.has(proto)) {
    return true;
  }
  // Mammoth embeds raster images as data:image/...;base64,... — allow on src.
  // data:image/svg+xml passes here too, but an img element renders SVG in
  // image-loading mode with no script execution (HTML spec), so the narrow
  // 'data:image/*' allowance is still safe for src. Keep scoped to src only.
  return attr === 'src' && proto === 'data' && /^data:image\//i.test(parsed.href);
}
/**
 * Neutralise unsafe `href="…"` / `src="…"` values in mammoth's HTML output.
 *
 * Every double-quoted href is checked first, then every double-quoted src.
 * A value rejected by _attrIsSafe is replaced with `#` and a note naming the
 * scheme found in the raw value (or `unknown`) is recorded.
 *
 * @param {string} html - HTML produced by mammoth.
 * @returns {{html: string, skipped: string[]}} the rewritten HTML and one
 *   note per replaced attribute (href notes before src notes).
 */
function sanitizeMammothUrls(html) {
  const skipped = [];
  const schemePattern = /^\s*([a-z][a-z0-9+.\-]*):/i;

  // Run one attribute's pattern over the markup, swapping unsafe values out.
  const scrub = (markup, attr, pattern) =>
    markup.replace(pattern, (whole, value) => {
      if (_attrIsSafe(attr, value)) {
        return whole;
      }
      const found = schemePattern.exec(value);
      let scheme = 'unknown';
      if (found) {
        scheme = found[1].toLowerCase();
      }
      skipped.push(`stripped unsafe ${attr} (scheme: ${scheme}:)`);
      return `${attr}="#"`;
    });

  const hrefCleaned = scrub(html, 'href', /href="([^"]*)"/g);
  const fullyCleaned = scrub(hrefCleaned, 'src', /src="([^"]*)"/g);
  return { html: fullyCleaned, skipped };
}
// marked v14 explicitly does NOT sanitize HTML — its README points readers at
// DOMPurify. The seed bootstrap injects INLINE_DOC via m.innerHTML AND
// re-creates