## 🎯 Nouveau système d'erreurs graduées (architecture SmartTouch) ### Architecture procédurale intelligente : - **3 niveaux de gravité** : Légère (50%) → Moyenne (30%) → Grave (10%) - **14 types d'erreurs** réalistes et subtiles - **Sélection procédurale** selon contexte (longueur, technique, heure) - **Distribution contrôlée** : max 1 grave, 2 moyennes, 3 légères par article ### 1. Erreurs GRAVES (10% articles max) : - Accord sujet-verbe : "ils sont" → "ils est" - Mot manquant : "pour garantir la qualité" → "pour garantir qualité" - Double mot : "pour garantir" → "pour pour garantir" - Négation oubliée : "n'est pas" → "est pas" ### 2. Erreurs MOYENNES (30% articles) : - Accord pluriel : "plaques résistantes" → "plaques résistant" - Virgule manquante : "Ainsi, il" → "Ainsi il" - Registre inapproprié : "Par conséquent" → "Du coup" - Préposition incorrecte : "résistant aux" → "résistant des" - Connecteur illogique : "cependant" → "donc" ### 3. Erreurs LÉGÈRES (50% articles) : - Double espace : "de votre" → "de  votre" - Trait d'union : "c'est-à-dire" → "c'est à dire" - Espace ponctuation : "qualité ?" → "qualité?" 
- Majuscule : "Toutenplaque" → "toutenplaque" - Apostrophe droite : "l'article" → "l'article" ## ✅ Système anti-répétition complet : ### Corrections critiques : - **HumanSimulationTracker.js** : Tracker centralisé global - **Word boundaries (\b)** sur TOUS les regex → FIX "maison" → "néanmoinson" - **Protection 30+ expressions idiomatiques** françaises - **Anti-répétition** : max 2× même mot, jamais 2× même développement - **Diversification** : 48 variantes (hésitations, développements, connecteurs) ### Nouvelle structure (comme SmartTouch) : ``` lib/human-simulation/ ├── error-profiles/ (NOUVEAU) │ ├── ErrorProfiles.js (définitions + probabilités) │ ├── ErrorGrave.js (10% articles) │ ├── ErrorMoyenne.js (30% articles) │ ├── ErrorLegere.js (50% articles) │ └── ErrorSelector.js (sélection procédurale) ├── HumanSimulationCore.js (orchestrateur) ├── HumanSimulationTracker.js (anti-répétition) └── [autres modules] ``` ## 🔄 Remplace ancien système : - ❌ SpellingErrors.js (basique, répétitif, "et" → "." × 8) - ✅ error-profiles/ (gradué, procédural, intelligent, diversifié) ## 🎲 Fonctionnalités procédurales : - Analyse contexte : longueur texte, complexité technique, heure rédaction - Multiplicateurs adaptatifs selon contexte - Conditions application intelligentes - Tracking global par batch (respecte limites 10%/30%/50%) ## 📊 Résultats validation : Sur 100 articles → ~40-50 avec erreurs subtiles et diverses (plus de spam répétitif) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
817 lines
26 KiB
JavaScript
817 lines
26 KiB
JavaScript
// ========================================
|
||
// ADVERSARIAL CORE - MOTEUR MODULAIRE
|
||
// Responsabilité: Moteur adversarial réutilisable sur tout contenu
|
||
// Architecture: Couches applicables à la demande
|
||
// ========================================
|
||
|
||
const { logSh } = require('../ErrorReporting');
|
||
const { tracer } = require('../trace');
|
||
const { callLLM } = require('../LLMManager');
|
||
|
||
// Import stratégies et utilitaires
|
||
const { DetectorStrategyFactory, selectOptimalStrategy } = require('./DetectorStrategies');
|
||
|
||
/**
|
||
* MAIN ENTRY POINT - APPLICATION COUCHE ADVERSARIALE
|
||
* Input: contenu existant + configuration adversariale
|
||
* Output: contenu avec couche adversariale appliquée
|
||
*/
|
||
async function applyAdversarialLayer(existingContent, config = {}) {
|
||
return await tracer.run('AdversarialCore.applyAdversarialLayer()', async () => {
|
||
const {
|
||
detectorTarget = 'general',
|
||
intensity = 1.0,
|
||
method = 'regeneration', // 'regeneration' | 'enhancement' | 'hybrid'
|
||
preserveStructure = true,
|
||
csvData = null,
|
||
context = {},
|
||
llmProvider = 'gemini-pro' // ✅ AJOUTÉ: Extraction llmProvider avec fallback
|
||
} = config;
|
||
|
||
await tracer.annotate({
|
||
adversarialLayer: true,
|
||
detectorTarget,
|
||
intensity,
|
||
method,
|
||
llmProvider,
|
||
elementsCount: Object.keys(existingContent).length
|
||
});
|
||
|
||
const startTime = Date.now();
|
||
logSh(`🎯 APPLICATION COUCHE ADVERSARIALE: ${detectorTarget} (${method})`, 'INFO');
|
||
logSh(` 📊 ${Object.keys(existingContent).length} éléments | Intensité: ${intensity} | LLM: ${llmProvider}`, 'INFO');
|
||
|
||
try {
|
||
// Initialiser stratégie détecteur
|
||
const strategy = DetectorStrategyFactory.createStrategy(detectorTarget);
|
||
|
||
// Appliquer méthode adversariale choisie avec LLM spécifié
|
||
let adversarialContent = {};
|
||
const methodConfig = { ...config, llmProvider }; // ✅ Assurer propagation llmProvider
|
||
|
||
switch (method) {
|
||
case 'regeneration':
|
||
adversarialContent = await applyRegenerationMethod(existingContent, methodConfig, strategy);
|
||
break;
|
||
case 'enhancement':
|
||
adversarialContent = await applyEnhancementMethod(existingContent, methodConfig, strategy);
|
||
break;
|
||
case 'hybrid':
|
||
adversarialContent = await applyHybridMethod(existingContent, methodConfig, strategy);
|
||
break;
|
||
default:
|
||
throw new Error(`Méthode adversariale inconnue: ${method}`);
|
||
}
|
||
|
||
const duration = Date.now() - startTime;
|
||
const stats = {
|
||
elementsProcessed: Object.keys(existingContent).length,
|
||
elementsModified: countModifiedElements(existingContent, adversarialContent),
|
||
detectorTarget,
|
||
intensity,
|
||
method,
|
||
duration
|
||
};
|
||
|
||
logSh(`✅ COUCHE ADVERSARIALE APPLIQUÉE: ${stats.elementsModified}/${stats.elementsProcessed} modifiés (${duration}ms)`, 'INFO');
|
||
|
||
await tracer.event('Couche adversariale appliquée', stats);
|
||
|
||
return {
|
||
content: adversarialContent,
|
||
stats,
|
||
modifications: stats.elementsModified, // ✅ AJOUTÉ: Mapping pour PipelineExecutor
|
||
original: existingContent,
|
||
config
|
||
};
|
||
|
||
} catch (error) {
|
||
const duration = Date.now() - startTime;
|
||
logSh(`❌ COUCHE ADVERSARIALE ÉCHOUÉE après ${duration}ms: ${error.message}`, 'ERROR');
|
||
|
||
// Fallback: retourner contenu original
|
||
logSh(`🔄 Fallback: contenu original conservé`, 'WARNING');
|
||
return {
|
||
content: existingContent,
|
||
stats: { fallback: true, duration },
|
||
original: existingContent,
|
||
config,
|
||
error: error.message
|
||
};
|
||
}
|
||
}, { existingContent: Object.keys(existingContent), config });
|
||
}
|
||
|
||
/**
|
||
* MÉTHODE RÉGÉNÉRATION - Réécrire complètement avec prompts adversariaux
|
||
*/
|
||
async function applyRegenerationMethod(existingContent, config, strategy) {
|
||
const llmToUse = config.llmProvider || 'gemini-pro';
|
||
logSh(`🔄 Méthode régénération adversariale (LLM: ${llmToUse})`, 'DEBUG');
|
||
|
||
const results = {};
|
||
const contentEntries = Object.entries(existingContent);
|
||
|
||
// Traiter en chunks pour éviter timeouts
|
||
const chunks = chunkArray(contentEntries, 4);
|
||
|
||
for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex++) {
|
||
const chunk = chunks[chunkIndex];
|
||
logSh(` 📦 Régénération chunk ${chunkIndex + 1}/${chunks.length}: ${chunk.length} éléments`, 'DEBUG');
|
||
|
||
try {
|
||
const regenerationPrompt = createRegenerationPrompt(chunk, config, strategy);
|
||
|
||
const response = await callLLM(llmToUse, regenerationPrompt, {
|
||
temperature: 0.7 + (config.intensity * 0.2), // Température variable selon intensité
|
||
maxTokens: 2000 * chunk.length
|
||
}, config.csvData?.personality);
|
||
|
||
const chunkResults = parseRegenerationResponse(response, chunk);
|
||
Object.assign(results, chunkResults);
|
||
|
||
logSh(` ✅ Chunk ${chunkIndex + 1}: ${Object.keys(chunkResults).length} éléments régénérés`, 'DEBUG');
|
||
|
||
// Délai entre chunks
|
||
if (chunkIndex < chunks.length - 1) {
|
||
await sleep(1500);
|
||
}
|
||
|
||
} catch (error) {
|
||
logSh(` ❌ Chunk ${chunkIndex + 1} échoué: ${error.message}`, 'ERROR');
|
||
|
||
// Fallback: garder contenu original pour ce chunk
|
||
chunk.forEach(([tag, content]) => {
|
||
results[tag] = content;
|
||
});
|
||
}
|
||
}
|
||
|
||
return results;
|
||
}
|
||
|
||
/**
|
||
* MÉTHODE ENHANCEMENT - Améliorer sans réécrire complètement
|
||
*/
|
||
async function applyEnhancementMethod(existingContent, config, strategy) {
|
||
const llmToUse = config.llmProvider || 'gemini-pro';
|
||
logSh(`🔧 Méthode enhancement adversarial (LLM: ${llmToUse})`, 'DEBUG');
|
||
|
||
const results = { ...existingContent }; // Base: contenu original
|
||
const elementsToEnhance = selectElementsForEnhancement(existingContent, config);
|
||
|
||
if (elementsToEnhance.length === 0) {
|
||
logSh(` ⏭️ Aucun élément nécessite enhancement`, 'DEBUG');
|
||
return results;
|
||
}
|
||
|
||
logSh(` 📋 ${elementsToEnhance.length} éléments sélectionnés pour enhancement`, 'DEBUG');
|
||
|
||
const enhancementPrompt = createEnhancementPrompt(elementsToEnhance, config, strategy);
|
||
|
||
try {
|
||
const response = await callLLM(llmToUse, enhancementPrompt, {
|
||
temperature: 0.5 + (config.intensity * 0.3),
|
||
maxTokens: 3000
|
||
}, config.csvData?.personality);
|
||
|
||
const enhancedResults = parseEnhancementResponse(response, elementsToEnhance);
|
||
|
||
// Appliquer améliorations
|
||
Object.keys(enhancedResults).forEach(tag => {
|
||
if (enhancedResults[tag] !== existingContent[tag]) {
|
||
results[tag] = enhancedResults[tag];
|
||
}
|
||
});
|
||
|
||
return results;
|
||
|
||
} catch (error) {
|
||
logSh(`❌ Enhancement échoué: ${error.message}`, 'ERROR');
|
||
return results; // Fallback: contenu original
|
||
}
|
||
}
|
||
|
||
/**
|
||
* MÉTHODE HYBRIDE - Combinaison régénération + enhancement
|
||
*/
|
||
async function applyHybridMethod(existingContent, config, strategy) {
|
||
logSh(`⚡ Méthode hybride adversariale`, 'DEBUG');
|
||
|
||
// 1. Enhancement léger sur tout le contenu
|
||
const enhancedContent = await applyEnhancementMethod(existingContent, {
|
||
...config,
|
||
intensity: config.intensity * 0.6 // Intensité réduite pour enhancement
|
||
}, strategy);
|
||
|
||
// 2. Régénération ciblée sur éléments clés
|
||
const keyElements = selectKeyElementsForRegeneration(enhancedContent, config);
|
||
|
||
if (keyElements.length === 0) {
|
||
return enhancedContent;
|
||
}
|
||
|
||
const keyElementsContent = {};
|
||
keyElements.forEach(tag => {
|
||
keyElementsContent[tag] = enhancedContent[tag];
|
||
});
|
||
|
||
const regeneratedElements = await applyRegenerationMethod(keyElementsContent, {
|
||
...config,
|
||
intensity: config.intensity * 1.2 // Intensité augmentée pour régénération
|
||
}, strategy);
|
||
|
||
// 3. Merger résultats
|
||
const hybridContent = { ...enhancedContent };
|
||
Object.keys(regeneratedElements).forEach(tag => {
|
||
hybridContent[tag] = regeneratedElements[tag];
|
||
});
|
||
|
||
return hybridContent;
|
||
}
|
||
|
||
// ============= HELPER FUNCTIONS =============
|
||
|
||
/**
|
||
* Créer prompt de régénération adversariale
|
||
*/
|
||
function createRegenerationPrompt(chunk, config, strategy) {
|
||
const { detectorTarget, intensity, csvData } = config;
|
||
|
||
let prompt = `MISSION: Réécris ces contenus pour éviter détection par ${detectorTarget}.
|
||
|
||
TECHNIQUE ANTI-${detectorTarget.toUpperCase()}:
|
||
${strategy.getInstructions(intensity).join('\n')}
|
||
|
||
CONTENUS À RÉÉCRIRE:
|
||
|
||
${chunk.map(([tag, content], i) => `[${i + 1}] TAG: ${tag}
|
||
ORIGINAL: "${content}"`).join('\n\n')}
|
||
|
||
CONSIGNES:
|
||
- GARDE exactement le même message et informations factuelles
|
||
- CHANGE structure, vocabulaire, style pour éviter détection ${detectorTarget}
|
||
- Intensité adversariale: ${intensity.toFixed(2)}
|
||
${csvData?.personality ? `- Style: ${csvData.personality.nom} (${csvData.personality.style})` : ''}
|
||
|
||
IMPORTANT: Réponse DIRECTE par les contenus réécrits, pas d'explication.
|
||
|
||
FORMAT:
|
||
[1] Contenu réécrit anti-${detectorTarget}
|
||
[2] Contenu réécrit anti-${detectorTarget}
|
||
etc...`;
|
||
|
||
return prompt;
|
||
}
|
||
|
||
/**
|
||
* Créer prompt d'enhancement adversarial
|
||
*/
|
||
function createEnhancementPrompt(elementsToEnhance, config, strategy) {
|
||
const { detectorTarget, intensity } = config;
|
||
|
||
let prompt = `MISSION: Améliore subtilement ces contenus pour réduire détection ${detectorTarget}.
|
||
|
||
AMÉLIORATIONS CIBLÉES:
|
||
${strategy.getEnhancementTips(intensity).join('\n')}
|
||
|
||
ÉLÉMENTS À AMÉLIORER:
|
||
|
||
${elementsToEnhance.map((element, i) => `[${i + 1}] TAG: ${element.tag}
|
||
CONTENU: "${element.content}"
|
||
PROBLÈME: ${element.detectionRisk}`).join('\n\n')}
|
||
|
||
CONSIGNES:
|
||
- Modifications LÉGÈRES et naturelles
|
||
- GARDE le fond du message intact
|
||
- Focus sur réduction détection ${detectorTarget}
|
||
- Intensité: ${intensity.toFixed(2)}
|
||
|
||
FORMAT DE RÉPONSE OBLIGATOIRE (UN PAR LIGNE):
|
||
[1] Contenu légèrement amélioré pour élément 1
|
||
[2] Contenu légèrement amélioré pour élément 2
|
||
[3] Contenu légèrement amélioré pour élément 3
|
||
etc...
|
||
|
||
IMPORTANT:
|
||
- Réponds UNIQUEMENT avec les contenus améliorés
|
||
- GARDE le numéro [N] devant chaque contenu
|
||
- PAS d'explications, PAS de commentaires
|
||
- RESPECTE STRICTEMENT le format [N] Contenu`;
|
||
|
||
return prompt;
|
||
}
|
||
|
||
/**
|
||
* Parser réponse régénération
|
||
*/
|
||
function parseRegenerationResponse(response, chunk) {
|
||
const results = {};
|
||
const regex = /\[(\d+)\]\s*([^[]*?)(?=\n\[\d+\]|$)/gs;
|
||
let match;
|
||
const parsedItems = {};
|
||
|
||
while ((match = regex.exec(response)) !== null) {
|
||
const index = parseInt(match[1]) - 1;
|
||
const content = cleanAdversarialContent(match[2].trim());
|
||
if (index >= 0 && index < chunk.length) {
|
||
parsedItems[index] = content;
|
||
}
|
||
}
|
||
|
||
// Mapper aux vrais tags
|
||
chunk.forEach(([tag, originalContent], index) => {
|
||
if (parsedItems[index] && parsedItems[index].length > 10) {
|
||
results[tag] = parsedItems[index];
|
||
} else {
|
||
results[tag] = originalContent; // Fallback
|
||
logSh(`⚠️ Fallback régénération pour [${tag}]`, 'WARNING');
|
||
}
|
||
});
|
||
|
||
return results;
|
||
}
|
||
|
||
/**
|
||
* Parser réponse enhancement
|
||
*/
|
||
function parseEnhancementResponse(response, elementsToEnhance) {
|
||
const results = {};
|
||
|
||
// Log réponse brute pour debug
|
||
logSh(`📥 Réponse LLM (${response.length} chars): ${response.substring(0, 200)}...`, 'DEBUG');
|
||
|
||
const regex = /\[(\d+)\]\s*([^[]*?)(?=\n\[\d+\]|$)/gs;
|
||
let match;
|
||
const parsedIndexes = new Set();
|
||
|
||
while ((match = regex.exec(response)) !== null) {
|
||
const num = parseInt(match[1]);
|
||
const index = num - 1; // [1] = index 0
|
||
|
||
if (index >= 0 && index < elementsToEnhance.length && !parsedIndexes.has(index)) {
|
||
let enhancedContent = cleanAdversarialContent(match[2].trim());
|
||
const element = elementsToEnhance[index];
|
||
|
||
if (enhancedContent && enhancedContent.length > 10) {
|
||
results[element.tag] = enhancedContent;
|
||
parsedIndexes.add(index);
|
||
logSh(` ✅ Parsé [${num}] ${element.tag}: ${enhancedContent.substring(0, 50)}...`, 'DEBUG');
|
||
} else {
|
||
logSh(` ⚠️ [${num}] ${element.tag}: contenu trop court (${enhancedContent?.length || 0} chars)`, 'WARNING');
|
||
}
|
||
}
|
||
}
|
||
|
||
// Vérifier si parsing a échoué
|
||
if (Object.keys(results).length === 0 && elementsToEnhance.length > 0) {
|
||
logSh(`❌ PARSING ÉCHOUÉ: Aucun élément parsé (format LLM invalide)`, 'ERROR');
|
||
logSh(` Réponse complète: ${response}`, 'ERROR');
|
||
|
||
// FALLBACK: Essayer parsing alternatif (sans numéros)
|
||
logSh(` 🔄 Tentative parsing alternatif...`, 'WARNING');
|
||
|
||
// Diviser par double saut de ligne ou tirets
|
||
const chunks = response.split(/\n\n+|---+/).map(c => c.trim()).filter(c => c.length > 10);
|
||
|
||
chunks.forEach((chunk, idx) => {
|
||
if (idx < elementsToEnhance.length) {
|
||
const cleaned = cleanAdversarialContent(chunk);
|
||
if (cleaned && cleaned.length > 10) {
|
||
results[elementsToEnhance[idx].tag] = cleaned;
|
||
logSh(` ✅ Fallback [${idx + 1}]: ${cleaned.substring(0, 50)}...`, 'DEBUG');
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
logSh(`📦 Résultat parsing: ${Object.keys(results).length}/${elementsToEnhance.length} éléments extraits`, 'DEBUG');
|
||
|
||
return results;
|
||
}
|
||
|
||
/**
|
||
* Sélectionner éléments pour enhancement
|
||
*/
|
||
function selectElementsForEnhancement(existingContent, config) {
|
||
const elements = [];
|
||
|
||
// ✅ Threshold basé sur intensity
|
||
// intensity >= 1.0 → threshold = 0.3 (traiter risque moyen/élevé)
|
||
// intensity < 1.0 → threshold = 0.4 (traiter uniquement risque élevé)
|
||
const threshold = config.intensity >= 1.0 ? 0.3 : 0.4;
|
||
|
||
logSh(`🎯 Sélection enhancement avec threshold=${(threshold * 100).toFixed(0)}% (intensity=${config.intensity})`, 'DEBUG');
|
||
|
||
Object.entries(existingContent).forEach(([tag, content]) => {
|
||
const detectionRisk = assessDetectionRisk(content, config.detectorTarget);
|
||
|
||
if (detectionRisk.score > threshold) {
|
||
elements.push({
|
||
tag,
|
||
content,
|
||
detectionRisk: detectionRisk.reasons.join(', ') || 'prévention_générale',
|
||
priority: detectionRisk.score
|
||
});
|
||
logSh(` ✅ [${tag}] Sélectionné: score=${(detectionRisk.score * 100).toFixed(0)}% > ${(threshold * 100).toFixed(0)}%`, 'INFO');
|
||
} else {
|
||
// Log éléments ignorés pour debug
|
||
logSh(` ⏭️ [${tag}] Ignoré: score=${(detectionRisk.score * 100).toFixed(0)}% ≤ ${(threshold * 100).toFixed(0)}%`, 'DEBUG');
|
||
}
|
||
});
|
||
|
||
// Trier par priorité (risque élevé en premier)
|
||
elements.sort((a, b) => b.priority - a.priority);
|
||
|
||
logSh(` 📊 Sélection: ${elements.length}/${Object.keys(existingContent).length} éléments (threshold=${(threshold * 100).toFixed(0)}%)`, 'DEBUG');
|
||
|
||
return elements;
|
||
}
|
||
|
||
/**
|
||
* Sélectionner éléments clés pour régénération (hybride)
|
||
*/
|
||
function selectKeyElementsForRegeneration(content, config) {
|
||
const keyTags = [];
|
||
|
||
Object.keys(content).forEach(tag => {
|
||
// Éléments clés: titres, intro, premiers paragraphes
|
||
if (tag.includes('Titre') || tag.includes('H1') || tag.includes('intro') ||
|
||
tag.includes('Introduction') || tag.includes('1')) {
|
||
keyTags.push(tag);
|
||
}
|
||
});
|
||
|
||
return keyTags.slice(0, 3); // Maximum 3 éléments clés
|
||
}
|
||
|
||
/**
|
||
* Évaluer risque de détection (approche statistique générique)
|
||
* Basé sur des métriques linguistiques universelles sans mots hardcodés
|
||
*/
|
||
function assessDetectionRisk(content, detectorTarget) {
|
||
const reasons = [];
|
||
|
||
// Parsing de base
|
||
const sentences = content.split(/[.!?]+/).filter(s => s.trim().length > 10);
|
||
const words = content.split(/\s+/).filter(w => w.length > 0);
|
||
|
||
// Validation & Mode texte court
|
||
if (words.length < 5) {
|
||
return { score: 0, reasons: ['texte_trop_court(<5_mots)'], metrics: {} };
|
||
}
|
||
|
||
// ✅ MODE TEXTE COURT (1 phrase ou <10 mots)
|
||
if (sentences.length < 2 || words.length < 10) {
|
||
return assessShortTextRisk(content, words, detectorTarget);
|
||
}
|
||
|
||
// === CALCULER TOUTES LES MÉTRIQUES ===
|
||
const metrics = {
|
||
lexicalDiversity: calculateLexicalDiversity(words),
|
||
burstiness: calculateBurstiness(sentences),
|
||
syntaxEntropy: calculateSyntaxEntropy(sentences),
|
||
punctuationComplexity: calculatePunctuationComplexity(content),
|
||
redundancy: calculateRedundancy(words),
|
||
wordUniformity: calculateWordUniformity(words)
|
||
};
|
||
|
||
// === SCORING ADAPTATIF PAR DÉTECTEUR ===
|
||
let score = 0;
|
||
|
||
if (detectorTarget === 'gptZero') {
|
||
// GPTZero privilégie : perplexité + burstiness
|
||
score += metrics.lexicalDiversity.score * 0.30;
|
||
score += metrics.burstiness.score * 0.25;
|
||
score += metrics.syntaxEntropy.score * 0.15;
|
||
score += metrics.punctuationComplexity.score * 0.10;
|
||
score += metrics.redundancy.score * 0.10;
|
||
score += metrics.wordUniformity.score * 0.10;
|
||
|
||
if (metrics.lexicalDiversity.score > 0.3 && metrics.burstiness.score > 0.3) {
|
||
score += 0.05; // Bonus si double flag
|
||
reasons.push('gptzero_double_flag');
|
||
}
|
||
|
||
} else if (detectorTarget === 'originality') {
|
||
// Originality.ai privilégie : redondance + entropie syntaxique
|
||
score += metrics.redundancy.score * 0.30;
|
||
score += metrics.syntaxEntropy.score * 0.25;
|
||
score += metrics.lexicalDiversity.score * 0.15;
|
||
score += metrics.burstiness.score * 0.15;
|
||
score += metrics.punctuationComplexity.score * 0.10;
|
||
score += metrics.wordUniformity.score * 0.05;
|
||
|
||
if (metrics.redundancy.score > 0.4) {
|
||
score += 0.05; // Bonus haute redondance
|
||
reasons.push('originality_redondance_élevée');
|
||
}
|
||
|
||
} else {
|
||
// Détecteur général : ponctuation = meilleur indicateur (40%)
|
||
// Les LLMs modernes ont bon TTR et burstiness, mais ponctuation trop simple
|
||
const weights = [0.10, 0.20, 0.10, 0.40, 0.15, 0.05];
|
||
const metricScores = Object.values(metrics).map(m => m.score);
|
||
score = metricScores.reduce((sum, s, i) => sum + s * weights[i], 0);
|
||
}
|
||
|
||
// Collecter raisons
|
||
Object.entries(metrics).forEach(([name, data]) => {
|
||
if (data.score > 0.3) { // Seuil significatif
|
||
reasons.push(data.reason);
|
||
}
|
||
});
|
||
|
||
return {
|
||
score: Math.min(1, score),
|
||
reasons: reasons.length > 0 ? reasons : ['analyse_générale'],
|
||
metrics // Retourner pour debug
|
||
};
|
||
}
|
||
|
||
// ============= HELPER FUNCTIONS - MÉTRIQUES STATISTIQUES =============
|
||
|
||
/**
|
||
* 1️⃣ Diversité lexicale (Type-Token Ratio)
|
||
*/
|
||
function calculateLexicalDiversity(words) {
|
||
const cleanWords = words.map(w => w.toLowerCase().replace(/[^\w]/g, '')).filter(w => w.length > 0);
|
||
const uniqueWords = new Set(cleanWords);
|
||
const ttr = uniqueWords.size / cleanWords.length;
|
||
|
||
// TTR < 0.5 = vocabulaire répétitif (IA)
|
||
let score = 0;
|
||
if (ttr < 0.5) {
|
||
score = (0.5 - ttr) / 0.5; // Normaliser 0.5→0 = 0, 0→0.5 = 1
|
||
}
|
||
|
||
return {
|
||
score,
|
||
value: ttr,
|
||
reason: `low_lexical_diversity(TTR=${ttr.toFixed(2)})`
|
||
};
|
||
}
|
||
|
||
/**
|
||
* 2️⃣ Burstiness (Variation longueur phrases)
|
||
*/
|
||
function calculateBurstiness(sentences) {
|
||
const lengths = sentences.map(s => s.length);
|
||
const avg = lengths.reduce((a, b) => a + b, 0) / lengths.length;
|
||
const variance = lengths.reduce((sum, len) => sum + Math.pow(len - avg, 2), 0) / lengths.length;
|
||
const stdDev = Math.sqrt(variance);
|
||
const cv = stdDev / avg; // Coefficient de variation
|
||
|
||
// ✅ FIX: Seuil abaissé de 0.35 à 0.25 (LLMs modernes plus uniformes)
|
||
// CV < 0.25 = phrases très uniformes (IA moderne)
|
||
let score = 0;
|
||
if (cv < 0.25) {
|
||
score = (0.25 - cv) / 0.25; // Normaliser 0.25→0 = 0, 0→0.25 = 1
|
||
}
|
||
|
||
return {
|
||
score,
|
||
value: cv,
|
||
reason: `low_burstiness(CV=${cv.toFixed(2)})`
|
||
};
|
||
}
|
||
|
||
/**
|
||
* 3️⃣ Entropie syntaxique (Débuts de phrases répétés)
|
||
*/
|
||
function calculateSyntaxEntropy(sentences) {
|
||
const starts = sentences.map(s => {
|
||
const words = s.trim().split(/\s+/);
|
||
return words.slice(0, 2).join(' ').toLowerCase();
|
||
});
|
||
|
||
const freq = {};
|
||
starts.forEach(start => {
|
||
freq[start] = (freq[start] || 0) + 1;
|
||
});
|
||
|
||
const maxFreq = Math.max(...Object.values(freq));
|
||
const entropy = maxFreq / sentences.length;
|
||
|
||
// Entropie > 0.5 = >50% phrases commencent pareil (monotone)
|
||
let score = 0;
|
||
if (entropy > 0.5) {
|
||
score = (entropy - 0.5) / 0.5; // Normaliser 0.5→1 = 0→1
|
||
}
|
||
|
||
return {
|
||
score,
|
||
value: entropy,
|
||
reason: `high_syntax_entropy(${(entropy * 100).toFixed(0)}%)`
|
||
};
|
||
}
|
||
|
||
/**
|
||
* 4️⃣ Complexité ponctuation
|
||
*/
|
||
function calculatePunctuationComplexity(content) {
|
||
const simplePunct = (content.match(/[.,]/g) || []).length;
|
||
const complexPunct = (content.match(/[;:!?()—…]/g) || []).length;
|
||
const total = simplePunct + complexPunct;
|
||
|
||
if (total === 0) {
|
||
return { score: 0, value: 0, reason: 'no_punctuation' };
|
||
}
|
||
|
||
const ratio = complexPunct / total;
|
||
|
||
// Ratio < 0.1 = ponctuation trop simple (IA)
|
||
let score = 0;
|
||
if (ratio < 0.1) {
|
||
score = (0.1 - ratio) / 0.1; // Normaliser 0.1→0 = 0, 0→0.1 = 1
|
||
}
|
||
|
||
return {
|
||
score,
|
||
value: ratio,
|
||
reason: `low_punctuation_complexity(${(ratio * 100).toFixed(0)}%)`
|
||
};
|
||
}
|
||
|
||
/**
|
||
* 5️⃣ Redondance structurelle (Bigrammes répétés)
|
||
*/
|
||
function calculateRedundancy(words) {
|
||
const bigrams = [];
|
||
for (let i = 0; i < words.length - 1; i++) {
|
||
const bigram = `${words[i]} ${words[i + 1]}`.toLowerCase();
|
||
bigrams.push(bigram);
|
||
}
|
||
|
||
const freq = {};
|
||
bigrams.forEach(bg => {
|
||
freq[bg] = (freq[bg] || 0) + 1;
|
||
});
|
||
|
||
const repeatedCount = Object.values(freq).filter(count => count > 1).length;
|
||
const redundancy = repeatedCount / bigrams.length;
|
||
|
||
// Redondance > 0.2 = 20%+ bigrammes répétés (IA)
|
||
let score = 0;
|
||
if (redundancy > 0.2) {
|
||
score = Math.min(1, (redundancy - 0.2) / 0.3); // Normaliser 0.2→0.5 = 0→1
|
||
}
|
||
|
||
return {
|
||
score,
|
||
value: redundancy,
|
||
reason: `high_redundancy(${(redundancy * 100).toFixed(0)}%)`
|
||
};
|
||
}
|
||
|
||
/**
|
||
* 6️⃣ Uniformité longueur mots
|
||
*/
|
||
function calculateWordUniformity(words) {
|
||
const lengths = words.map(w => w.replace(/[^\w]/g, '').length).filter(l => l > 0);
|
||
|
||
if (lengths.length === 0) {
|
||
return { score: 0, value: 0, reason: 'no_words' };
|
||
}
|
||
|
||
const avg = lengths.reduce((a, b) => a + b, 0) / lengths.length;
|
||
const variance = lengths.reduce((sum, len) => sum + Math.pow(len - avg, 2), 0) / lengths.length;
|
||
const stdDev = Math.sqrt(variance);
|
||
|
||
// StdDev < 2.5 ET moyenne 4-8 lettres = mots uniformes (IA)
|
||
let score = 0;
|
||
if (stdDev < 2.5 && avg >= 4 && avg <= 8) {
|
||
score = (2.5 - stdDev) / 2.5; // Normaliser 2.5→0 = 0, 0→2.5 = 1
|
||
}
|
||
|
||
return {
|
||
score,
|
||
value: stdDev,
|
||
reason: `uniform_word_length(σ=${stdDev.toFixed(1)}, avg=${avg.toFixed(1)})`
|
||
};
|
||
}
|
||
|
||
/**
|
||
* ✅ MODE SPÉCIAL: Évaluation textes courts (1 phrase ou <10 mots)
|
||
* Utilise métriques adaptées aux textes courts
|
||
*/
|
||
function assessShortTextRisk(content, words, detectorTarget) {
|
||
let score = 0;
|
||
const reasons = [];
|
||
|
||
// === MÉTRIQUE 1: Complexité ponctuation (poids 50%) ===
|
||
const simplePunct = (content.match(/[.,]/g) || []).length;
|
||
const complexPunct = (content.match(/[;:!?()—…]/g) || []).length;
|
||
const total = simplePunct + complexPunct;
|
||
|
||
let punctScore = 0;
|
||
if (total > 0) {
|
||
const ratio = complexPunct / total;
|
||
if (ratio < 0.1) {
|
||
punctScore = (0.1 - ratio) / 0.1;
|
||
reasons.push(`low_punctuation(${(ratio * 100).toFixed(0)}%)`);
|
||
}
|
||
} else {
|
||
// Aucune ponctuation = suspect
|
||
punctScore = 0.3;
|
||
reasons.push('no_punctuation');
|
||
}
|
||
|
||
score += punctScore * 0.50;
|
||
|
||
// === MÉTRIQUE 2: Longueur moyenne mots (poids 30%) ===
|
||
const lengths = words.map(w => w.replace(/[^\w]/g, '').length).filter(l => l > 0);
|
||
if (lengths.length > 0) {
|
||
const avg = lengths.reduce((a, b) => a + b) / lengths.length;
|
||
// Mots trop longs = formel/IA (avg > 7 lettres)
|
||
if (avg > 7) {
|
||
const wordLengthScore = (avg - 7) / 5; // Normaliser 7→12 = 0→1
|
||
score += Math.min(1, wordLengthScore) * 0.30;
|
||
reasons.push(`long_words(avg=${avg.toFixed(1)})`);
|
||
}
|
||
}
|
||
|
||
// === MÉTRIQUE 3: Ton formel (poids 20%) ===
|
||
const lowerContent = content.toLowerCase();
|
||
|
||
// Mots formels suspects
|
||
const formalWords = ['optimal', 'idéal', 'efficace', 'robuste', 'innovant', 'essentiel', 'crucial'];
|
||
const formalCount = formalWords.reduce((c, w) => c + (lowerContent.includes(w) ? 1 : 0), 0);
|
||
|
||
// Mots casual
|
||
const casualWords = ['super', 'top', 'cool', 'bref', 'truc', 'machin', 'genre'];
|
||
const casualCount = casualWords.reduce((c, w) => c + (lowerContent.includes(w) ? 1 : 0), 0);
|
||
|
||
if (formalCount > 0 && casualCount === 0 && words.length > 5) {
|
||
score += 0.20;
|
||
reasons.push(`formal_tone(${formalCount}_mots)`);
|
||
}
|
||
|
||
return {
|
||
score: Math.min(1, score),
|
||
reasons: reasons.length > 0 ? reasons : ['short_text_ok'],
|
||
metrics: {
|
||
textLength: words.length,
|
||
punctuationRatio: total > 0 ? complexPunct / total : 0,
|
||
avgWordLength: lengths.length > 0 ? lengths.reduce((a, b) => a + b) / lengths.length : 0
|
||
}
|
||
};
|
||
}
|
||
|
||
/**
|
||
* Nettoyer contenu adversarial généré
|
||
*/
|
||
function cleanAdversarialContent(content) {
|
||
if (!content) return content;
|
||
|
||
// Supprimer préfixes indésirables
|
||
content = content.replace(/^(voici\s+)?le\s+contenu\s+(réécrit|amélioré)[:\s]*/gi, '');
|
||
content = content.replace(/^(bon,?\s*)?(alors,?\s*)?/gi, '');
|
||
content = content.replace(/\*\*[^*]+\*\*/g, '');
|
||
content = content.replace(/\s{2,}/g, ' ');
|
||
content = content.trim();
|
||
|
||
return content;
|
||
}
|
||
|
||
/**
|
||
* Compter éléments modifiés
|
||
*/
|
||
function countModifiedElements(original, modified) {
|
||
let count = 0;
|
||
|
||
Object.keys(original).forEach(tag => {
|
||
if (modified[tag] && modified[tag] !== original[tag]) {
|
||
count++;
|
||
}
|
||
});
|
||
|
||
return count;
|
||
}
|
||
|
||
/**
|
||
* Chunk array utility
|
||
*/
|
||
function chunkArray(array, size) {
|
||
const chunks = [];
|
||
for (let i = 0; i < array.length; i += size) {
|
||
chunks.push(array.slice(i, i + size));
|
||
}
|
||
return chunks;
|
||
}
|
||
|
||
/**
|
||
* Sleep utility
|
||
*/
|
||
function sleep(ms) {
|
||
return new Promise(resolve => setTimeout(resolve, ms));
|
||
}
|
||
|
||
module.exports = {
|
||
applyAdversarialLayer, // ← MAIN ENTRY POINT MODULAIRE
|
||
applyRegenerationMethod,
|
||
applyEnhancementMethod,
|
||
applyHybridMethod,
|
||
assessDetectionRisk,
|
||
selectElementsForEnhancement
|
||
}; |