// parsers.jsx — Read uploaded files into plain text in the browser.
// Supports: .txt/.md, .docx, .xlsx (bilingual when two text columns are found), .tmx, .xliff/.xlf, .pdf (server only)
// PDF parsing is delegated to the server in production (pdf.js is heavy).

/**
 * Read the content of an uploaded file as raw bytes using FileReader.
 *
 * @param {Blob} file - File (or Blob) to read.
 * @returns {Promise<ArrayBuffer>} Resolves with the reader's result.
 *   Rejects with the reader's error object rather than the raw error event,
 *   so callers can rely on `err.message`.
 */
function readFileAsArrayBuffer(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    // The event passed to onerror carries no message; the actual failure
    // reason lives on reader.error.
    reader.onerror = () => reject(reader.error ?? new Error('Failed to read file'));
    reader.readAsArrayBuffer(file);
  });
}
/**
 * Read the content of an uploaded file as text, requesting UTF-8 decoding.
 *
 * @param {Blob} file - File (or Blob) to read.
 * @returns {Promise<string>} Resolves with the reader's result.
 *   Rejects with the reader's error object rather than the raw error event,
 *   so callers can rely on `err.message`.
 */
function readFileAsText(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    // The event passed to onerror carries no message; the actual failure
    // reason lives on reader.error.
    reader.onerror = () => reject(reader.error ?? new Error('Failed to read file'));
    reader.readAsText(file, 'utf-8');
  });
}

/**
 * Parse an uploaded file into plain text, dispatching on the file extension
 * (txt, md, docx, xlsx, tmx, xlf, xliff, pdf).
 *
 * Uses `window.mammoth` for .docx and `window.XLSX` for .xlsx; PDFs are
 * posted to `/api/parse-pdf` and only when `window.__APE_USE_SERVER` is set.
 *
 * @param {File} file - Uploaded file; its `name` selects the parser.
 * @returns {Promise<{text: string, kind: 'monolingual'|'bilingual',
 *   extra?: {source: string, target: string}}>}
 *   For bilingual results, `extra.source` and `extra.target` hold the
 *   segments joined by blank lines, row-aligned (segment i of source pairs
 *   with segment i of target), and `text` is source + '\n\n---\n\n' + target.
 * @throws {Error} On an unsupported or missing extension, malformed XML,
 *   a PDF without server support, or a failed PDF server request.
 */
async function parseFile(file) {
  const name = file.name.toLowerCase();
  // Only text after the last dot counts as an extension, so a file literally
  // named "pdf" or "txt" is not mistaken for that type.
  const dotAt = name.lastIndexOf('.');
  const ext = dotAt === -1 ? '' : name.slice(dotAt + 1);

  // Build the shared bilingual result shape from two row-aligned lists.
  const bilingual = (srcRows, tgtRows) => {
    const source = srcRows.join('\n\n');
    const target = tgtRows.join('\n\n');
    return {
      text: `${source}\n\n---\n\n${target}`,
      kind: 'bilingual',
      extra: { source, target },
    };
  };

  // DOMParser does not throw on malformed XML; it returns a document
  // containing a <parsererror> element. Surface that as an exception instead
  // of silently producing an empty result.
  const parseXml = (xml) => {
    const doc = new DOMParser().parseFromString(xml, 'application/xml');
    if (doc.querySelector('parsererror')) {
      throw new Error(`Could not parse ${file.name}: malformed XML`);
    }
    return doc;
  };

  // Trimmed text of the first descendant matching `selector`, or ''.
  const firstText = (parent, selector) =>
    parent.querySelector(selector)?.textContent?.trim() || '';

  if (ext === 'txt' || ext === 'md') {
    const text = await readFileAsText(file);
    return { text, kind: 'monolingual' };
  }

  if (ext === 'docx') {
    const ab = await readFileAsArrayBuffer(file);
    const result = await window.mammoth.extractRawText({ arrayBuffer: ab });
    return { text: result.value, kind: 'monolingual' };
  }

  if (ext === 'xlsx') {
    const ab = await readFileAsArrayBuffer(file);
    const wb = window.XLSX.read(ab, { type: 'array' });
    // Heuristic: bilingual = first sheet has 2 columns that each hold a
    // meaningful amount of text (first row is treated as a header).
    const sheet = wb.Sheets[wb.SheetNames[0]];
    const rows = window.XLSX.utils.sheet_to_json(sheet, { header: 1, defval: '' });
    if (rows.length > 1 && rows[0].length >= 2) {
      const dataRows = rows.slice(1);
      // reduce instead of Math.max(...spread): spreading one argument per
      // row can exceed the engine's argument limit on very large sheets.
      const colCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
      const colChars = new Array(colCount).fill(0);
      for (const row of dataRows) {
        for (let i = 0; i < colCount; i++) {
          if (typeof row[i] === 'string') colChars[i] += row[i].length;
        }
      }
      // Pick the two columns with the most characters.
      const ranked = colChars
        .map((chars, index) => [chars, index])
        .sort((a, b) => b[0] - a[0])
        .slice(0, 2);
      if (ranked.length === 2 && ranked[0][0] > 100 && ranked[1][0] > 100) {
        // The left-most of the two columns is taken as the source.
        const [iA, iB] = [ranked[0][1], ranked[1][1]].sort((a, b) => a - b);
        const srcRows = [];
        const tgtRows = [];
        for (const row of dataRows) {
          if (row[iA] && row[iB]) {
            srcRows.push(String(row[iA]));
            tgtRows.push(String(row[iB]));
          }
        }
        return bilingual(srcRows, tgtRows);
      }
    }
    // Fallback: flatten the sheet
    const flat = window.XLSX.utils.sheet_to_csv(sheet);
    return { text: flat, kind: 'monolingual' };
  }

  if (ext === 'tmx') {
    const doc = parseXml(await readFileAsText(file));
    const srcRows = [];
    const tgtRows = [];
    doc.querySelectorAll('tu').forEach((tu) => {
      const tuvs = tu.querySelectorAll('tuv');
      if (tuvs.length < 2) return;
      // The first <tuv> is taken as source, the second as target.
      const src = firstText(tuvs[0], 'seg');
      const tgt = firstText(tuvs[1], 'seg');
      // Keep only complete pairs (as the xlsx branch does); pushing the two
      // sides independently would shift every later row out of alignment.
      if (src && tgt) {
        srcRows.push(src);
        tgtRows.push(tgt);
      }
    });
    return bilingual(srcRows, tgtRows);
  }

  if (ext === 'xlf' || ext === 'xliff') {
    const doc = parseXml(await readFileAsText(file));
    // XLIFF 1.2 uses <trans-unit>, 2.0 uses <unit> — match both.
    // NOTE(review): only the first <source>/<target> inside each unit is
    // read, so a unit holding several segments contributes just its first.
    const srcRows = [];
    const tgtRows = [];
    doc.querySelectorAll('trans-unit, unit').forEach((unit) => {
      const src = firstText(unit, 'source');
      const tgt = firstText(unit, 'target');
      // Keep only complete pairs so source and target stay row-aligned.
      if (src && tgt) {
        srcRows.push(src);
        tgtRows.push(tgt);
      }
    });
    return bilingual(srcRows, tgtRows);
  }

  if (ext === 'pdf') {
    if (!window.__APE_USE_SERVER) {
      throw new Error('PDF parsing requires the server. Please upload .docx or .txt.');
    }
    const ab = await readFileAsArrayBuffer(file);
    const bytes = new Uint8Array(ab);
    // Convert to a binary string in chunks; passing the whole array to
    // fromCharCode at once could exceed the argument limit.
    let binary = '';
    for (let i = 0; i < bytes.length; i += 8192) {
      binary += String.fromCharCode.apply(null, bytes.subarray(i, i + 8192));
    }
    const b64 = btoa(binary);
    const res = await fetch('/api/parse-pdf', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      credentials: 'include',
      body: JSON.stringify({ data: b64 }),
    });
    // An error response may not be JSON at all (e.g. an HTML error page);
    // don't let the JSON parse failure mask the real problem.
    let body = null;
    let parseError;
    try {
      body = await res.json();
    } catch (err) {
      parseError = err;
    }
    if (!res.ok || body == null) {
      throw new Error(
        body?.error || 'PDF parsing failed',
        parseError ? { cause: parseError } : undefined,
      );
    }
    return { text: body.text, kind: 'monolingual' };
  }

  throw new Error(
    ext ? `Unsupported file type: .${ext}` : 'Unsupported file type: (no extension)',
  );
}

/**
 * Format a byte count as a short human-readable string using 1024-based
 * units: whole bytes below 1 KB, otherwise one decimal in KB or MB.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} e.g. "512 B", "1.5 KB", "3.2 MB".
 */
function formatSize(bytes) {
  const KB = 1024;
  const MB = KB * KB;
  if (bytes < KB) return `${bytes} B`;
  if (bytes < MB) {
    const kb = (bytes / KB).toFixed(1);
    // Values just under 1 MB round up to "1024.0"; show those as MB rather
    // than printing a KB figure that crosses the unit boundary.
    if (kb !== '1024.0') return `${kb} KB`;
  }
  return `${(bytes / MB).toFixed(1)} MB`;
}

/**
 * Count words in mixed-script text.
 *
 * Every character in the CJK ideograph, hiragana, or katakana ranges counts
 * as one word. The remaining text is split on whitespace, and each piece
 * containing at least one letter or digit counts as one word.
 *
 * @param {string} text - Text to measure; falsy input yields 0.
 * @returns {number} Total word count.
 */
function wordCount(text) {
  if (!text) return 0;

  // Blank out CJK characters in a single pass, tallying them as we go.
  let cjkTotal = 0;
  const withoutCjk = text.replace(/[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]/g, () => {
    cjkTotal += 1;
    return ' ';
  });

  // Pieces made only of punctuation/symbols do not count as words.
  let tokenTotal = 0;
  for (const piece of withoutCjk.split(/\s+/)) {
    if (/[\p{L}\p{N}]/u.test(piece)) tokenTotal += 1;
  }

  return tokenTotal + cjkTotal;
}

// Publish the helpers on window (presumably so other scripts can call them
// without a module import — confirm against the page's script setup).
window.parseFile = parseFile;
window.formatSize = formatSize;
window.wordCount = wordCount;
