272 lines
7.7 KiB
JavaScript
272 lines
7.7 KiB
JavaScript
import OpenAI from 'openai';
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
|
|
// Cached OpenAI client; stays null until getOpenAI() creates it on first use.
let openai = null;
|
|
|
|
// Max characters per chunk; kept below the ~24000 characters (~6000 tokens) estimated for most languages
|
|
const MAX_CHUNK_CHARS = 20_000;
|
|
|
|
/**
 * Supported language codes mapped to the English language names that are
 * interpolated into the translation prompt.
 *
 * Frozen because the table is handed out by reference (see getLanguages), so
 * a caller must not be able to alter the names used by later translations.
 */
const LANGUAGES = Object.freeze({
  en: 'English',
  fr: 'French',
  es: 'Spanish',
  de: 'German',
  it: 'Italian',
  pt: 'Portuguese',
  zh: 'Chinese',
  ja: 'Japanese',
  ko: 'Korean',
  ru: 'Russian',
  ar: 'Arabic',
  hi: 'Hindi',
  nl: 'Dutch',
  pl: 'Polish',
  tr: 'Turkish',
  vi: 'Vietnamese',
  th: 'Thai',
  sv: 'Swedish',
  da: 'Danish',
  fi: 'Finnish',
  no: 'Norwegian',
  cs: 'Czech',
  el: 'Greek',
  he: 'Hebrew',
  id: 'Indonesian',
  ms: 'Malay',
  ro: 'Romanian',
  uk: 'Ukrainian',
});
|
|
|
|
// Sentence-ending characters for the supported languages: ASCII . ! ?,
// ideographic full stop (U+3002), fullwidth ! and ? (U+FF01, U+FF1F),
// halfwidth ideographic full stop (U+FF61), Arabic question mark (U+061F),
// Devanagari danda (U+0964), and newline. Escapes are used so the fullwidth
// forms survive Unicode normalisation of the source file.
// The `g` flag is required by callers that iterate matches with exec().
const SENTENCE_ENDINGS = /[.!?\u3002\uFF01\uFF1F\uFF61\u061F\u0964\n]/g;
|
|
|
|
/**
 * Return the shared OpenAI client, constructing it on the first call.
 *
 * @returns {OpenAI} The cached client instance.
 * @throws {Error} If OPENAI_API_KEY is not set when the client is first built.
 */
function getOpenAI() {
  // Fast path: an earlier call already built the client.
  if (openai) {
    return openai;
  }

  const { OPENAI_API_KEY: apiKey } = process.env;
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY environment variable is not set');
  }

  openai = new OpenAI({ apiKey });
  return openai;
}
|
|
|
|
/**
 * Split text into chunks of roughly `maxChars` characters, cutting at the
 * most natural boundary available.
 *
 * For each window of `maxChars` characters the cut point is chosen in this order:
 *   1. the last sentence ending in the second half of the window;
 *   2. the first sentence ending within 20% past the window (the chunk may
 *      therefore exceed `maxChars` by up to 20%);
 *   3. the last whitespace in the second half of the window, so words are
 *      not cut in half;
 *   4. a hard cut at `maxChars`, moved forward by one code unit if it would
 *      otherwise separate a surrogate pair.
 *
 * Every chunk except the final remainder is trimmed; empty chunks are dropped.
 * Text that already fits is returned unchanged as a single chunk.
 *
 * @param {string} text - Text to split
 * @param {number} maxChars - Maximum characters per chunk (fractions are floored)
 * @returns {string[]} Array of text chunks
 * @throws {RangeError} If `maxChars` is not a finite number >= 1 (such values
 *   previously made the loop spin without advancing).
 */
function splitIntoChunks(text, maxChars = MAX_CHUNK_CHARS) {
  if (!Number.isFinite(maxChars) || maxChars < 1) {
    throw new RangeError(`maxChars must be a positive number, got ${maxChars}`);
  }
  const limit = Math.floor(maxChars);

  if (text.length <= limit) {
    return [text];
  }

  // Non-global copy of the shared pattern: test() on a /g regex advances its
  // lastIndex, which would leak state into other users of SENTENCE_ENDINGS.
  const sentenceEnd = new RegExp(
    SENTENCE_ENDINGS.source,
    SENTENCE_ENDINGS.flags.replace(/[gy]/g, '')
  );
  const whitespace = /\s/;
  const minCut = limit * 0.5; // a backward cut must keep more than half a window
  const lookahead = limit * 0.2; // how far past the window a sentence end may be

  // Scan backwards from the end of the window; returns the cut position just
  // after the last matching character, or -1 if none lies in the second half.
  const lastCutInWindow = (pattern, start, windowEnd) => {
    for (let i = windowEnd - 1; i - start + 1 > minCut; i--) {
      if (pattern.test(text[i])) {
        return i + 1;
      }
    }
    return -1;
  };

  const chunks = [];
  let start = 0;

  while (start < text.length) {
    const windowEnd = start + limit;

    // The remainder fits in one window: take it as-is.
    if (windowEnd >= text.length) {
      chunks.push(text.slice(start));
      break;
    }

    let cut = lastCutInWindow(sentenceEnd, start, windowEnd);

    if (cut === -1) {
      const forwardEnd = Math.min(text.length, Math.trunc(windowEnd + lookahead));
      for (let i = windowEnd; i < forwardEnd; i++) {
        if (sentenceEnd.test(text[i])) {
          cut = i + 1;
          break;
        }
      }
    }

    if (cut === -1) {
      cut = lastCutInWindow(whitespace, start, windowEnd);
    }

    if (cut === -1) {
      cut = windowEnd;
      const before = text.charCodeAt(cut - 1);
      const after = text.charCodeAt(cut);
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (splitsPair) {
        cut += 1; // keep the low surrogate with its high surrogate
      }
    }

    chunks.push(text.slice(start, cut).trim());
    start = cut;

    // Skip any leading whitespace for the next chunk
    while (start < text.length && whitespace.test(text[start])) {
      start++;
    }
  }

  return chunks.filter((chunk) => chunk.length > 0);
}
|
|
|
|
/**
 * Expose the table of supported languages.
 *
 * @returns {Object<string, string>} The module's LANGUAGES object itself (not
 *   a copy), keyed by language code with the language name as value.
 */
export function getLanguages() {
  const supportedLanguages = LANGUAGES;
  return supportedLanguages;
}
|
|
|
|
/**
 * Translate a single chunk of text with one chat-completion request
 * (model `gpt-4o-mini`, `max_tokens` 16384).
 *
 * @param {string} text - Text to translate
 * @param {string} targetLanguage - Name of the language to translate into
 * @param {string|null} sourceLanguage - Name of the source language; when
 *   falsy the prompt does not mention a source language
 * @returns {Promise<string>} The translated text
 * @throws {Error} If the response carries no text content (no choices, or a
 *   null `message.content`); errors from the API client propagate unchanged.
 */
async function translateChunk(text, targetLanguage, sourceLanguage) {
  const direction = sourceLanguage
    ? `from ${sourceLanguage} to ${targetLanguage}`
    : `to ${targetLanguage}`;
  const prompt = `Translate the following text ${direction}. Only output the translation, nothing else:\n\n${text}`;

  const response = await getOpenAI().chat.completions.create({
    model: 'gpt-4o-mini',
    max_tokens: 16384,
    messages: [
      {
        role: 'user',
        content: prompt,
      },
    ],
  });

  const choice = response?.choices?.[0];
  const content = choice?.message?.content;

  // Fail with a descriptive error instead of a TypeError (no choices) or a
  // null "translation" that would later be joined/written as the text "null".
  if (typeof content !== 'string') {
    throw new Error(
      `OpenAI returned no translation (finish_reason: ${choice?.finish_reason ?? 'unknown'})`
    );
  }

  // 'length' means the output stopped at max_tokens, so the tail is missing.
  if (choice.finish_reason === 'length') {
    console.warn('Translation was truncated at the max_tokens limit; output may be incomplete');
  }

  return content;
}
|
|
|
|
/**
 * Translate text using GPT-4o-mini with chunking for long texts.
 *
 * The text is split with splitIntoChunks(); each chunk is translated in order
 * and multi-chunk results are joined with a blank line between them.
 *
 * @param {string} text - Text to translate
 * @param {string} targetLang - Target language code (e.g., 'en', 'fr'); a
 *   value that is not a known code is passed to the model as-is
 * @param {string} sourceLang - Source language code (optional, auto-detect if null)
 * @returns {Promise<{success: boolean, originalText: string, translatedText: string,
 *   targetLanguage: string, sourceLanguage: string, chunks: number}>}
 * @throws {Error} 'No text provided for translation' for blank or non-string
 *   text; 'No target language provided' for a missing target; otherwise
 *   'Translation failed: …' with the underlying error attached as `cause`.
 */
export async function translateText(text, targetLang, sourceLang = null) {
  if (typeof text !== 'string' || !text.trim()) {
    throw new Error('No text provided for translation');
  }
  if (!targetLang) {
    throw new Error('No target language provided');
  }

  // Own-property lookup only: a plain LANGUAGES[code] would also resolve
  // inherited keys such as 'constructor' or 'toString'.
  const languageName = (code) =>
    (Object.prototype.hasOwnProperty.call(LANGUAGES, code) && LANGUAGES[code]) || code;

  const targetLanguage = languageName(targetLang);
  const sourceLanguage = sourceLang ? languageName(sourceLang) : null;

  try {
    const chunks = splitIntoChunks(text);
    let translatedText;

    if (chunks.length === 1) {
      // Single chunk - translate the original text directly
      translatedText = await translateChunk(text, targetLanguage, sourceLanguage);
    } else {
      console.log(`Splitting text into ${chunks.length} chunks for translation...`);
      const translations = [];

      // Chunks are translated one at a time, in order, as before.
      for (let i = 0; i < chunks.length; i++) {
        console.log(` Translating chunk ${i + 1}/${chunks.length} (${chunks[i].length} chars)...`);
        translations.push(await translateChunk(chunks[i], targetLanguage, sourceLanguage));
      }

      translatedText = translations.join('\n\n');
    }

    return {
      success: true,
      originalText: text,
      translatedText,
      targetLanguage,
      sourceLanguage: sourceLanguage || 'auto-detected',
      chunks: chunks.length,
    };
  } catch (error) {
    // Keep the original error (and its stack) reachable for debugging.
    throw new Error(`Translation failed: ${error.message}`, { cause: error });
  }
}
|
|
|
|
/**
 * Translate a text file and write the result next to it (or into `outputDir`)
 * as `<basename>_<targetLang>.txt`.
 *
 * @param {string} filePath - Path to text file (read as UTF-8)
 * @param {string} targetLang - Target language code
 * @param {string} sourceLang - Source language code (optional)
 * @param {string} outputDir - Output directory (optional); created if missing.
 *   Defaults to the directory of `filePath`.
 * @returns {Promise<Object>} The translateText() result plus `originalPath`
 *   and `translationPath`
 * @throws {Error} `File not found: …` if `filePath` does not exist; read,
 *   write and translation errors propagate.
 */
export async function translateFile(filePath, targetLang, sourceLang = null, outputDir = null) {
  if (!fs.existsSync(filePath)) {
    throw new Error(`File not found: ${filePath}`);
  }

  // Make sure the destination exists before spending an API call, so an
  // unusable output directory fails early instead of after translating.
  const targetDir = outputDir || path.dirname(filePath);
  await fs.promises.mkdir(targetDir, { recursive: true });

  // Async I/O keeps the event loop free while large files are read/written.
  const text = await fs.promises.readFile(filePath, 'utf-8');
  const result = await translateText(text, targetLang, sourceLang);

  // Save translation
  const baseName = path.basename(filePath, path.extname(filePath));
  const outputPath = path.join(targetDir, `${baseName}_${targetLang}.txt`);
  await fs.promises.writeFile(outputPath, result.translatedText, 'utf-8');

  return {
    ...result,
    originalPath: filePath,
    translationPath: outputPath,
  };
}
|
|
|
|
/**
 * Translate several files one after another, collecting a per-file outcome.
 *
 * A failure on one file is logged and recorded; it does not stop the batch.
 *
 * @param {string[]} filePaths - Paths of the files to translate
 * @param {string} targetLang - Target language code
 * @param {string} sourceLang - Source language code (optional)
 * @param {string} outputDir - Output directory (optional)
 * @param {Function} onProgress - Optional callback invoked before each file
 *   with `{ current, total, filePath }` (`current` is 1-based)
 * @returns {Promise<{success: boolean, results: Object[], totalFiles: number,
 *   successCount: number, failCount: number}>} Summary; `success` is always
 *   true, per-file status is in each entry of `results`.
 */
export async function translateMultiple(filePaths, targetLang, sourceLang = null, outputDir = null, onProgress = null) {
  const results = [];

  for (let index = 0; index < filePaths.length; index += 1) {
    const filePath = filePaths[index];
    const current = index + 1;

    if (onProgress) {
      onProgress({ current, total: filePaths.length, filePath });
    }

    console.log(`[${current}/${filePaths.length}] Translating: ${path.basename(filePath)}`);

    let outcome;
    try {
      outcome = await translateFile(filePath, targetLang, sourceLang, outputDir);
    } catch (error) {
      console.error(`Failed to translate ${filePath}: ${error.message}`);
      outcome = {
        success: false,
        originalPath: filePath,
        error: error.message,
      };
    }
    results.push(outcome);
  }

  // Every entry is either a success or a failure, so one count gives both.
  const successCount = results.reduce(
    (count, entry) => (entry.success ? count + 1 : count),
    0
  );

  return {
    success: true,
    results,
    totalFiles: filePaths.length,
    successCount,
    failCount: results.length - successCount,
  };
}
|