import a file

drop a .md, .csv, .docx, or .pdf file here, or choose one

// … block, so a filename containing <\/script> would close the script tag
// early and turn the rest into HTML. Matches escapeTL's <\/script handling.
//
// NOTE(review): the start of the comment above lies outside the visible chunk,
// and the head of the declaration below was lost together with stripped
// markup. The name and parameter are inferred from the surviving body and from
// the call in applyTitleAndFileSubs — confirm against the original file.
// The sequence is written as <\/script in comments on purpose: a literal
// closing script tag inside an inline script would end the script element.

/**
 * Escapes a value for embedding inside a single-quoted JS string literal.
 * Backslashes and single quotes are escaped, and any closing script tag is
 * broken up so it cannot terminate an enclosing inline script.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text safe to place between single quotes.
 */
const escapeJsString = s => s
  .replace(/\\/g, '\\\\')
  .replace(/'/g, "\\'")
  .replace(/<\/script/gi, '<\\/script');

/**
 * Normalizes line endings to LF. `null`/`undefined` become the empty string.
 *
 * @param {*} s - Value to normalize (coerced with String()).
 * @returns {string} Text with CRLF and lone CR replaced by LF.
 */
const canonLF = s => (s == null ? '' : String(s).replace(/\r\n/g, '\n').replace(/\r/g, '\n'));

/**
 * Escapes text for embedding inside a JS template literal: normalizes line
 * endings, then escapes backslashes, backticks, `${` and closing script tags.
 *
 * @param {*} s - Raw document text.
 * @returns {string} Text safe to place between backticks.
 */
const escapeTL = s => canonLF(s)
  .replace(/\\/g, '\\\\')
  .replace(/`/g, '\\`')
  .replace(/\$\{/g, '\\${')
  .replace(/<\/script/gi, '<\\/script');

/**
 * Derives a display title from a file basename: runs of `-`/`_` become
 * spaces and each word is capitalized.
 *
 * @param {string} basename - File name without extension.
 * @returns {string} Title-cased name, or 'Untitled' when no words remain.
 */
function titleFromBasename(basename) {
  return basename
    .replace(/[-_]+/g, ' ')
    .split(' ')
    .filter(Boolean)
    .map(w => w[0].toUpperCase() + w.slice(1))
    .join(' ') || 'Untitled';
}

/**
 * Substitutes the document title and the file-metadata string into the seed.
 * TITLE_RE, FILE_RE and escapeHtml are defined outside this block.
 *
 * @param {string} seed - Seed document text.
 * @param {{title: string, fileMeta: string}} subs - Values to insert.
 * @returns {string} Seed with both substitutions applied.
 * @throws {Error} When the seed does not contain exactly one match for
 *   TITLE_RE or for FILE_RE.
 */
function applyTitleAndFileSubs(seed, { title, fileMeta }) {
  for (const { re, label } of [
    { re: TITLE_RE, label: '<title>' },
    { re: FILE_RE, label: 'FILE:' },
  ]) {
    // Count with the regex's own flags (plus `g`) so the check agrees with
    // what the non-global replace below will actually match.
    const flags = re.flags.includes('g') ? re.flags : `${re.flags}g`;
    const matches = seed.match(new RegExp(re.source, flags)) || [];
    if (matches.length !== 1) {
      throw new Error(`seed must contain exactly one ${label} line, found ${matches.length}`);
    }
  }
  // Function replacers keep `$&`, `$1`, etc. in user-supplied text literal;
  // a replacement *string* would interpret them as substitution patterns.
  const titleTag = `<title>${escapeHtml(title)}</title>`;
  let out = seed.replace(TITLE_RE, () => titleTag);
  out = out.replace(FILE_RE, (_m, prefix) => `${prefix}'${escapeJsString(fileMeta)}'`);
  return out;
}

/**
 * Replaces the body of the INLINE_DOC template literal in the seed.
 * INLINE_DOC_MARKER is defined outside this block; the literal body is taken
 * to start immediately after the marker.
 *
 * @param {string} seed - Seed document text.
 * @param {string} newDoc - New document content (escaped via escapeTL).
 * @returns {string} Seed with the literal's body replaced.
 * @throws {Error} When the marker is missing or the literal is unterminated.
 */
function replaceInlineDoc(seed, newDoc) {
  const start = seed.indexOf(INLINE_DOC_MARKER);
  if (start < 0) throw new Error('cannot locate INLINE_DOC marker in seed');
  const cs = start + INLINE_DOC_MARKER.length;
  let i = cs;
  // Scan for the first unescaped backtick; a backslash skips the next char.
  while (i < seed.length) {
    if (seed[i] === '\\') { i += 2; continue; }
    if (seed[i] === '`') break;
    i++;
  }
  if (i >= seed.length) throw new Error('unterminated INLINE_DOC literal in seed');
  return seed.slice(0, cs) + escapeTL(newDoc) + seed.slice(i);
}

/**
 * Cheap probe: parses at most two rows with Papa and accepts the input only
 * if parsing reported no errors, the first row has at least two columns, and
 * a second row (when present) has the same column count.
 *
 * @param {string} text - Candidate CSV text.
 * @returns {boolean} True when the input looks like CSV.
 */
function looksLikeCsv(text) {
  const probe = Papa.parse(text, { preview: 2, skipEmptyLines: true, header: false });
  if (probe.errors.length > 0) return false;
  if (probe.data.length === 0) return false;
  const cols = probe.data[0].length;
  if (cols < 2) return false;
  if (probe.data.length === 2 && probe.data[1].length !== cols) return false;
  return true;
}

/**
 * Converts CSV text to an HTML table; the first row becomes the header.
 *
 * NOTE(review): the markup inside these string literals was stripped from the
 * text this block was recovered from. The table/thead/tbody/tr/th/td tags are
 * restored from the variable names and structure; any outer wrapper element
 * around the table could not be recovered — confirm against the original.
 *
 * @param {string} text - CSV text.
 * @returns {{html: string, warnings: string[]}} Table markup plus parse warnings.
 * @throws {Error} When the input fails the looksLikeCsv probe.
 */
function convertCsv(text) {
  if (!looksLikeCsv(text)) {
    throw new Error('csv probe failed: input does not look like CSV (need ≥2 columns with consistent column count)');
  }
  const result = Papa.parse(text, { skipEmptyLines: true, header: false });
  // Errors are kept as warnings; Papa is lenient and produces partial data,
  // matching the CLI's behavior where parse errors don't abort the import.
  const warnings = result.errors.map(e => {
    const where = e.row != null ? ` (row ${e.row + 1})` : '';
    return `csv parse: ${e.message}${where}`;
  });
  const rows = result.data;
  if (rows.length === 0) {
    return { html: '<table></table>', warnings };
  }
  const escapeCell = s => String(s == null ? '' : s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const [header, ...body] = rows;
  const thead = `<thead>\n<tr>${header.map(c => `<th>${escapeCell(c)}</th>`).join('')}</tr>\n</thead>`;
  const tbody = body.length === 0
    ? ''
    : `\n<tbody>\n${body.map(row => `<tr>${row.map(c => `<td>${escapeCell(c)}</td>`).join('')}</tr>`).join('\n')}\n</tbody>`;
  return { html: `<table>\n${thead}${tbody}\n</table>`, warnings };
}

/**
 * Converts a .docx ArrayBuffer to HTML with mammoth, then neutralizes unsafe
 * href/src URLs in the result.
 *
 * NOTE(review): the original wrapped the converted HTML in an element whose
 * tag was stripped from the recovered text; only the surviving newlines are
 * reproduced here — restore the wrapper from the original file.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw .docx bytes.
 * @returns {Promise<{html: string, warnings: string[]}>} Sanitized HTML plus
 *   sanitizer and mammoth messages, each prefixed with 'docx: '.
 * @throws {Error} When mammoth fails or produces an empty document.
 */
async function convertDocx(arrayBuffer) {
  let result;
  try {
    result = await mammoth.convertToHtml({ arrayBuffer });
  } catch (err) {
    throw new Error('docx: ' + (err && err.message ? err.message : String(err)));
  }
  const raw = (result.value || '').trim();
  if (!raw) throw new Error('docx: produced empty document — input may be corrupt or empty');
  const { html, skipped } = sanitizeMammothUrls(raw);
  const warnings = [
    ...skipped.map(s => `docx: ${s}`),
    ...(result.messages || []).map(m => `docx: ${m.message}`),
  ];
  return { html: `\n${html}\n`, warnings };
}

// Mammoth doesn't filter URL schemes — a docx with a `javascript:` hyperlink
// would land in the imported document and execute on click (stored XSS in the
// downloaded rwa container). Mirror of cli/src/import.mjs sanitizeMammothUrls.
const _SAFE_HREF_SCHEMES = new Set(['http', 'https', 'mailto', 'tel']);

// Two layers, both required:
// 1) Strip invisibles before parsing — whitespace + C0/C1 controls (\x00-\x1f,
//    \x7f-\xa0) + soft hyphen (\xad) + Cf-class format chars (ZWSP/ZWNJ/ZWJ,
//    LRM/RLM, LRE/RLE/PDF/LRO/RLO, word joiner, BOM, etc.). The previous
//    regex used JS \s which doesn't match these — they slipped through and
//    let a docx with `javascript:…` href bypass the scheme check.
// 2) Parse via WHATWG URL — the same parser the browser uses to navigate.
//    Resolve against a synthetic base so scheme-less inputs (relative URL,
//    fragment, path) round-trip back to that base and pass.
//
// NOTE(review): the character class held literal invisible characters; it is
// spelled with \u escapes here so it can be read and diffed. The recovered
// text may have shown a range starting at U+180E; it is written as U+180E
// plus U+200B–U+200F to match the list in the comment above — confirm.
const _ATTR_STRIP_RE = /[\s\x00-\x1f\x7f-\xa0\xad\u061c\u180e\u200b-\u200f\u202a-\u202e\u2060-\u206f]/g;
const _SANITIZER_BASE = 'http://_rwa_sanitizer_base_/';

/**
 * Decides whether a URL attribute value may be kept.
 *
 * @param {string} attr - Attribute name ('href' or 'src').
 * @param {string} val - Raw attribute value.
 * @returns {boolean} True when the value is relative, unparseable, uses an
 *   allowed scheme, or is a data:image/* URL on `src`.
 */
function _attrIsSafe(attr, val) {
  const normalized = String(val).replace(_ATTR_STRIP_RE, '');
  let parsed;
  try {
    parsed = new URL(normalized, _SANITIZER_BASE);
  } catch {
    return true; // unparseable → cannot be an active URL scheme
  }
  // resolved relative — no scheme present
  if (parsed.origin === 'http://_rwa_sanitizer_base_') return true;
  const proto = parsed.protocol.replace(/:$/, '').toLowerCase();
  if (_SAFE_HREF_SCHEMES.has(proto)) return true;
  // Mammoth embeds raster images as data:image/...;base64,... — allow on src.
  // data:image/svg+xml passes here too, but an image element renders SVG in
  // image-loading mode with no script execution (HTML spec), so the narrow
  // 'data:image/*' allowance is still safe for src. Keep scoped to src only.
  if (attr === 'src' && proto === 'data' && /^data:image\//i.test(parsed.href)) return true;
  return false;
}

/**
 * Rewrites unsafe double-quoted href/src attribute values to "#".
 *
 * @param {string} html - HTML produced by mammoth.
 * @returns {{html: string, skipped: string[]}} Sanitized HTML and one message
 *   per attribute that was neutralized.
 */
function sanitizeMammothUrls(html) {
  const skipped = [];
  const stripAttr = (attr) => (full, val) => {
    if (_attrIsSafe(attr, val)) return full;
    // Best-effort scheme name for the warning only; the safety decision
    // above was already made on the normalized, parsed value.
    const m = val.match(/^\s*([a-z][a-z0-9+.\-]*):/i);
    const scheme = m ? m[1].toLowerCase() : 'unknown';
    skipped.push(`stripped unsafe ${attr} (scheme: ${scheme}:)`);
    return `${attr}="#"`;
  };
  return {
    html: html
      .replace(/href="([^"]*)"/g, stripAttr('href'))
      .replace(/src="([^"]*)"/g, stripAttr('src')),
    skipped,
  };
}

// marked v14 explicitly does NOT sanitize HTML — its README points readers at
// DOMPurify. The seed bootstrap injects INLINE_DOC via m.innerHTML AND
// re-creates