subtitles Streaming / Entertainment
Upload any video or audio file. Workers AI routes to the best STT model per language — Nova-3 for English, Whisper for Thai and other local languages — then runs a two-pass AI correction before exporting a timestamped WebVTT file.
# Extract audio only — 2hr lecture → ~50 MB M4A
ffmpeg -i lecture.mp4 -vn -acodec copy lecture.m4a
# Or convert to MP3 at 64 kbps (smaller, still great quality)
ffmpeg -i lecture.mp4 -vn -acodec libmp3lame -b:a 64k lecture.mp3
// Bind Workers AI as AI in wrangler.toml: [ai] binding = "AI"
/**
 * Helper: convert seconds to a WebVTT timestamp "HH:MM:SS.mmm".
 *
 * @param {number} s - Offset from the start of the media, in seconds.
 *   Non-finite or negative values are treated as 0.
 * @returns {string} Timestamp with zero-padded hours, minutes, seconds and
 *   exactly three millisecond digits.
 */
function toVttTime(s) {
  const seconds = Number(s)
  // Work in whole milliseconds so that rounding carries into the larger
  // fields: 59.9996 s must render as 00:01:00.000, never 00:00:60.000.
  const totalMs = Number.isFinite(seconds) && seconds > 0 ? Math.round(seconds * 1000) : 0
  const hours = Math.floor(totalMs / 3600000)
  const minutes = Math.floor((totalMs % 3600000) / 60000)
  const wholeSeconds = Math.floor((totalMs % 60000) / 1000)
  const millis = totalMs % 1000
  const pad = (value, width) => String(value).padStart(width, '0')
  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(wholeSeconds, 2)}.${pad(millis, 3)}`
}
/**
 * Helper: encode an ArrayBuffer to base64 without Node.js Buffer.
 *
 * @param {ArrayBuffer} ab - Raw bytes to encode.
 * @returns {string} Base64 representation of the bytes ('' for an empty buffer).
 */
function bufferToBase64(ab) {
  const bytes = new Uint8Array(ab)
  // Passing every byte as a separate argument (spread or a single apply)
  // exceeds the engine's argument limit for real audio files and throws a
  // RangeError, so the binary string is assembled in bounded chunks.
  const CHUNK_SIZE = 0x8000
  let binary = ''
  for (let offset = 0; offset < bytes.length; offset += CHUNK_SIZE) {
    binary += String.fromCharCode.apply(null, bytes.subarray(offset, offset + CHUNK_SIZE))
  }
  return btoa(binary)
}
export default {
  /**
   * Worker entry point: transcribes an uploaded audio/video file and returns
   * the transcript, timed segments and a WebVTT document as JSON.
   *
   * Expects a POST with form data containing:
   *   - `file`: the media upload (required)
   *   - `language`: language code used for model routing (optional, default 'auto')
   *
   * @param {Request} request - Incoming request; non-POST requests are passed
   *   through with `fetch(request)`.
   * @param {{ AI: { run: Function } }} env - Bindings; `env.AI` is the Workers AI binding.
   * @returns {Promise<Response>} JSON `{ transcript, segments, vtt }` on success,
   *   400 JSON for a malformed upload, 502 JSON when the model call fails.
   */
  async fetch(request, env) {
    if (request.method !== 'POST') return fetch(request)

    let form
    try {
      form = await request.formData()
    } catch (err) {
      console.error('Could not parse form data', err)
      return Response.json({ error: 'Request body must be form data' }, { status: 400 })
    }

    // form.get() yields null for a missing field and a string for a plain
    // text field; only an uploaded file exposes arrayBuffer().
    const file = form.get('file')
    if (typeof file?.arrayBuffer !== 'function') {
      return Response.json({ error: 'Missing "file" upload field' }, { status: 400 })
    }

    // Normalise tags such as "TH" or "th-TH" to the primary subtag so they
    // still match the routing table below.
    const rawLanguage = form.get('language')
    const language = typeof rawLanguage === 'string' && rawLanguage.trim() !== ''
      ? rawLanguage.trim().toLowerCase().split(/[-_]/)[0]
      : 'auto'

    const buffer = await file.arrayBuffer()

    // Model routing based on language:
    // Thai/ASEAN -> Whisper large-v3-turbo ($0.0005/min)
    // English/EU -> Deepgram Nova-3 ($0.0052/min, lower WER)
    const ASEAN = ['th', 'id', 'vi', 'ms', 'km', 'lo', 'my', 'tl']
    const isAsean = ASEAN.includes(language)

    let transcript = ''
    let segments = []
    try {
      if (isAsean) {
        // Whisper requires base64-encoded audio — use btoa(), not Node.js Buffer
        const base64 = bufferToBase64(buffer)
        const result = await env.AI.run('@cf/openai/whisper-large-v3-turbo', {
          audio: base64, language
        })
        transcript = result.text ?? ''
        segments = (result.segments ?? []).map(s => ({
          start: s.start, end: s.end, text: s.text
        }))
      } else {
        // Nova-3 accepts a ReadableStream for audio body
        const stream = new ReadableStream({
          start(c) { c.enqueue(new Uint8Array(buffer)); c.close() }
        })
        const result = await env.AI.run('@cf/deepgram/nova-3', {
          audio: { body: stream, contentType: file.type },
          smart_format: true, utterances: true, detect_language: true
        })
        // The transcript is read from `results`, and Deepgram nests the
        // utterance list there as well; the top-level path is kept as a
        // fallback in case the binding flattens it.
        const utterances = result.results?.utterances ?? result.utterances ?? []
        segments = utterances.map(u => ({
          start: u.start, end: u.end, text: u.transcript
        }))
        transcript = result.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? ''
      }
    } catch (err) {
      console.error('Transcription failed', err)
      return Response.json({ error: 'Transcription failed' }, { status: 502 })
    }

    // WebVTT cue text must not contain raw "&", "<" or "-->", and a blank
    // line would terminate the cue early, so escape and collapse before use.
    const escapeCueText = (text) => String(text ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/\s*\n\s*/g, '\n')
      .trim()

    // Build WebVTT inline — no external helper needed
    const cues = segments.map((s, i) =>
      `${i + 1}\n${toVttTime(s.start)} --> ${toVttTime(s.end)}\n${escapeCueText(s.text)}`
    )
    const vtt = 'WEBVTT\n\n' + cues.join('\n\n')

    // After STT:
    // SEA-LION 27B cleans per-chunk hallucinations (foreign script, garbage words)
    // GPT-OSS 120B corrects domain vocabulary across the full transcript
    return Response.json({ transcript, segments, vtt })
  }
}
// Productionising this