// confluent/ConfluentTranslator/src/core/translation/promptBuilder.js

/**
 * Prompt Builder - Génère des prompts contextuels optimisés
 *
 * Fonctionnalités:
 * 1. Templates de base (règles linguistiques sans lexique massif)
 * 2. Injection de vocabulaire ciblé
 * 3. Fallback racines
 * 4. Formatage optimisé pour le LLM
 */

const fs = require('fs');
const path = require('path');
const { preprocessNumbers } = require('../numbers/numberPreprocessor');

/**
 * Reads the base system-prompt template for a language variant from disk.
 *
 * The file is looked up as `<variant>-system.txt` inside the `prompts`
 * directory located three levels above this module's directory.
 *
 * @param {string} variant - Template variant, e.g. 'proto' or 'ancien'
 * @returns {string} Template contents decoded as UTF-8
 * @throws {Error} When the template file does not exist
 */
function loadBaseTemplate(variant) {
  const relativeSegments = ['..', '..', '..', 'prompts', `${variant}-system.txt`];
  const templatePath = path.join(__dirname, ...relativeSegments);

  if (fs.existsSync(templatePath)) {
    return fs.readFileSync(templatePath, 'utf-8');
  }

  throw new Error(`Template not found: ${templatePath}`);
}

/**
 * Display groups of the vocabulary section, in output order.
 * Each pair is [group key, heading line pushed before the group's items].
 */
const VOCABULARY_GROUPS = [
  ['racine_sacree', '## Racines sacrées (voyelle initiale)\n'],
  ['racine', '## Racines standards\n'],
  ['verbe', '## Verbes\n'],
  ['nom', '## Noms et concepts\n'],
  ['autre', '## Autres\n']
];

/**
 * Case-folds a word for comparison; non-string values are returned untouched.
 * @param {*} word - Value to fold
 * @returns {*} Lower-cased string, or the value itself when it is not a string
 */
function foldVocabularyCase(word) {
  return typeof word === 'string' ? word.toLowerCase() : word;
}

/**
 * Appends `word` to `list` unless an equal word (ignoring case) is already there.
 * @param {Array} list - Array mutated in place
 * @param {string} word - Candidate word
 */
function addUniqueIgnoringCase(list, word) {
  const folded = foldVocabularyCase(word);
  if (!list.some(existing => foldVocabularyCase(existing) === folded)) {
    list.push(word);
  }
}

/**
 * Removes case-insensitive duplicates, keeping the first spelling encountered.
 * @param {Array} words - Words to deduplicate
 * @returns {Array} New array without duplicates, original order preserved
 */
function dedupeIgnoringCase(words) {
  const seen = new Set();
  return words.filter(word => {
    const folded = foldVocabularyCase(word);
    if (seen.has(folded)) {
      return false;
    }
    seen.add(folded);
    return true;
  });
}

/**
 * Maps a lexicon type onto one of the five display groups.
 * Any type containing "verbe" (e.g. "verbe", "verbe_irregulier") is a verb.
 * @param {string} type - Raw translation type
 * @returns {string} One of 'racine_sacree', 'racine', 'verbe', 'nom', 'autre'
 */
function classifyVocabularyType(type) {
  if (type === 'racine_sacree' || type === 'racine' || type === 'nom') {
    return type;
  }
  return type.includes('verbe') ? 'verbe' : 'autre';
}

/**
 * Renders one vocabulary item as a bullet line for the given display group.
 * Roots are listed Confluent-first with their bound form; every other group
 * is listed French-first. Only roots and verbs carry the note; only verbs
 * carry the raw type tag.
 * @param {Object} item - Merged vocabulary item
 * @param {string} groupKey - Display group the item belongs to
 * @returns {string} Formatted bullet line
 */
function formatVocabularyLine(item, groupKey) {
  // Variants come first, then synonyms; duplicates are dropped regardless of case.
  const french = dedupeIgnoringCase([...item.fr_variants, ...item.synonymes]).join(', ');
  const isRoot = groupKey === 'racine_sacree' || groupKey === 'racine';
  const isVerb = groupKey === 'verbe';

  let line = isRoot
    ? `- ${item.conf} (${french}) [forme liée: ${item.forme_liee}]`
    : `- ${french} → ${item.conf}`;

  if (isVerb && item.type) {
    line += ` [${item.type}]`;
  }
  if ((isRoot || isVerb) && item.note) {
    line += ` - ${item.note}`;
  }
  return line;
}

/**
 * Builds the vocabulary section of the prompt in a compact, grouped format.
 *
 * Every translation of every entry is considered. Translations sharing the
 * same Confluent word are merged into a single item whose French variants and
 * synonyms are accumulated without case-insensitive duplicates; the first
 * translation seen decides the item's type, bound form and note.
 *
 * @param {Array} entries - Relevant lexicon entries; each may carry
 *   `mot_francais`, `synonymes_fr` and a `traductions` array
 * @returns {string} Formatted section, or '' when `entries` is empty or missing
 */
function formatVocabularySection(entries) {
  if (!entries || entries.length === 0) {
    return '';
  }

  // Keyed by Confluent word; Map insertion order fixes the output order.
  const itemsByConfluent = new Map();

  for (const entry of entries) {
    if (!entry.traductions || entry.traductions.length === 0) {
      continue;
    }

    for (const trad of entry.traductions) {
      const existing = itemsByConfluent.get(trad.confluent);

      if (existing) {
        // Same Confluent word reached through another French entry: merge.
        addUniqueIgnoringCase(existing.fr_variants, entry.mot_francais);
        (entry.synonymes_fr || []).forEach(syn => {
          addUniqueIgnoringCase(existing.synonymes, syn);
        });
        continue;
      }

      itemsByConfluent.set(trad.confluent, {
        fr_variants: [entry.mot_francais],
        conf: trad.confluent,
        forme_liee: trad.forme_liee || trad.confluent,
        note: trad.note || '',
        type: trad.type || '',
        synonymes: [...(entry.synonymes_fr || [])],
        typeKey: classifyVocabularyType(trad.type || 'autre')
      });
    }
  }

  // Bucket the merged items by display group.
  const grouped = {};
  VOCABULARY_GROUPS.forEach(([groupKey]) => {
    grouped[groupKey] = [];
  });
  itemsByConfluent.forEach(item => {
    grouped[item.typeKey].push(item);
  });

  const lines = ['\n# VOCABULAIRE PERTINENT POUR CETTE TRADUCTION\n'];

  for (const [groupKey, heading] of VOCABULARY_GROUPS) {
    const items = grouped[groupKey];
    if (items.length === 0) {
      continue;
    }
    lines.push(heading);
    items.forEach(item => lines.push(formatVocabularyLine(item, groupKey)));
    lines.push('');
  }

  return lines.join('\n');
}

/**
 * Builds the fallback section listing the roots available for composition.
 *
 * Roots are split into sacred (`sacree` truthy) and standard ones, each group
 * preceded by a heading carrying its size. A root without `forme_liee` falls
 * back to its Confluent form, and the trailing domain is omitted when absent,
 * so the literal text "undefined" never ends up in the prompt.
 *
 * @param {Array} roots - Root entries (`confluent`, `mot_francais`,
 *   optional `forme_liee`, `domaine`, `sacree`)
 * @returns {string} Formatted section, or '' when `roots` is empty or missing
 */
function formatRootsFallback(roots) {
  if (!roots || roots.length === 0) {
    return '';
  }

  // One bullet per root; the domain suffix is appended only when known.
  const formatRoot = root => {
    const boundForm = root.forme_liee || root.confluent;
    const bullet = `- ${root.confluent} (${root.mot_francais}) [forme liée: ${boundForm}]`;
    return root.domaine ? `${bullet} - ${root.domaine}` : bullet;
  };

  const groups = [
    ['Racines sacrées', roots.filter(root => root.sacree)],
    ['Racines standards', roots.filter(root => !root.sacree)]
  ];

  const lines = ['\n# RACINES DISPONIBLES (à composer)\n'];
  lines.push('⚠️  Les mots demandés ne sont pas dans le lexique. Compose-les à partir des racines ci-dessous.\n');

  for (const [title, members] of groups) {
    if (members.length === 0) {
      continue;
    }
    lines.push(`## ${title} (${members.length})\n`);
    members.forEach(root => lines.push(formatRoot(root)));
    lines.push('');
  }

  lines.push('IMPORTANT: Utilise les liaisons sacrées pour composer les mots manquants.\n');

  return lines.join('\n');
}

/**
 * Assembles the full contextual prompt.
 *
 * Sections are joined with newlines in this order: base template, number
 * hints (only when `originalText` is given and the preprocessor reports
 * numbers with a prompt section), relevant vocabulary (skipped in fallback
 * mode), then the roots list (whenever `rootsFallback` is non-empty).
 *
 * @param {Object} contextResult - Result of analyzeContext()
 * @param {string} variant - 'proto' or 'ancien'
 * @param {string} originalText - Source text, scanned for numbers; optional
 * @returns {string} Complete prompt
 */
function buildContextualPrompt(contextResult, variant = 'ancien', originalText = '') {
  const basePrompt = loadBaseTemplate(variant);

  // Number hints are produced only when there is source text to scan.
  let numbersSection = '';
  if (originalText) {
    const { hasNumbers, promptSection } = preprocessNumbers(originalText);
    if (hasNumbers && promptSection) {
      numbersSection = promptSection;
    }
  }

  // Roots are injected in both modes, as they are needed for composition.
  const roots = contextResult.rootsFallback;
  const hasRoots = Boolean(roots) && roots.length > 0;
  const rootsSection = hasRoots ? formatRootsFallback(roots) : '';

  const sections = [basePrompt, numbersSection];

  // Fallback mode carries roots only; otherwise the vocabulary precedes them.
  if (!contextResult.useFallback) {
    sections.push(formatVocabularySection(contextResult.entries));
  }
  sections.push(rootsSection);

  return sections.join('\n');
}

/**
 * Returns the base prompt alone, with no lexicon injected
 * (intended for the useLexique=false case).
 *
 * @param {string} variant - 'proto' or 'ancien'; defaults to 'ancien'
 * @returns {string} Base prompt template as loaded by loadBaseTemplate()
 */
function getBasePrompt(variant = 'ancien') {
  const baseTemplate = loadBaseTemplate(variant);
  return baseTemplate;
}

/**
 * Estimates the number of tokens in a text.
 *
 * Rough heuristic: about one token per four characters, rounded up.
 * A missing text (null or undefined) counts as zero tokens instead of
 * throwing.
 *
 * @param {string} text - Text to estimate
 * @returns {number} Estimated token count
 */
function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;

  if (text == null) {
    return 0;
  }

  return Math.ceil(text.length / CHARS_PER_TOKEN);
}

/**
 * Computes statistics about a generated prompt compared with the cost of
 * sending the full lexicon.
 *
 * `savingsPercent` is the rounded share of full-lexicon tokens that were
 * saved. When the full-lexicon token count is not a positive number, the
 * percentage is reported as 0 rather than NaN or -Infinity.
 *
 * @param {string} prompt - Generated prompt
 * @param {Object} contextResult - Context result; reads `useFallback` and
 *   `metadata` (`tokensFullLexique`, `entriesUsed`, `wordsFound`, `wordsNotFound`)
 * @returns {Object} Statistics: promptTokens, fullLexiqueTokens, tokensSaved,
 *   savingsPercent, entriesUsed, useFallback, wordsFound, wordsNotFound
 */
function getPromptStats(prompt, contextResult) {
  const { metadata } = contextResult;
  const promptTokens = estimateTokens(prompt);
  const fullLexiqueTokens = metadata.tokensFullLexique;
  const saved = fullLexiqueTokens - promptTokens;

  // Guard the division: an empty or unknown baseline has no meaningful ratio.
  const savingsPercent = fullLexiqueTokens > 0
    ? Math.round((saved / fullLexiqueTokens) * 100)
    : 0;

  return {
    promptTokens,
    fullLexiqueTokens,
    tokensSaved: saved,
    savingsPercent,
    entriesUsed: metadata.entriesUsed,
    useFallback: contextResult.useFallback,
    wordsFound: metadata.wordsFound.length,
    wordsNotFound: metadata.wordsNotFound.length
  };
}

// Public API of this module (CommonJS exports).
module.exports = {
  loadBaseTemplate,
  formatVocabularySection,
  formatRootsFallback,
  buildContextualPrompt,
  getBasePrompt,
  estimateTokens,
  getPromptStats
};