# Source: transcriber/config.yaml.example (RedAtman/transcriber, main branch)
# Transcriber Configuration File
# This file contains all default configuration values.
# Edit the values below as needed. Settings that are shown commented out are
# optional; uncomment them to set them.

model:
  # Model to use. One of: tiny, base, small, medium, large-v3-turbo.
  name: 'base'
  # Language as an ISO 639-1 code (for example zh, en, ja), or "auto" to
  # have the language detected automatically.
  language: 'auto'
  # Optional quantization: q4_k, q5_k, q6_k or q8_0.
  # Leave it commented out (or empty) to use F16.
  # quantization: q5_k

audio:
  # Sample rate in Hz used for Whisper; described as fixed at 16 kHz.
  # NOTE(review): unclear whether any other value is accepted — confirm.
  sample_rate: 16000
  # Number of audio channels; described as fixed at mono (1).
  channels: 1

performance:
  # CPU thread count; 0 means detect automatically.
  threads: 0
  # GPU backend to use: auto, metal, cuda, vulkan or cpu.
  gpu: 'auto'

output:
  # Output formats to produce: txt, srt, json, or all.
  # The value is a YAML sequence, so list each format on its own "- " line.
  # NOTE(review): unclear whether a single comma-separated string (for
  # example "txt,srt") is also accepted here — confirm.
  # NOTE(review): the custom_metadata comments in this file refer to markdown
  # (md) output, which is not listed above — confirm whether md is accepted.
  formats:
    - txt
  # Output directory; an empty string means the same directory as the input file.
  directory: ""
  # When true, skip files that already have transcript output.
  skip_existing: true
  # When true, stream output to the file during transcription
  # (progressive writes).
  streaming: true

logging:
  # Verbosity: trace, debug, info or warn.
  level: 'info'
  # Path of the log file; an empty string means no file logging.
  file: ''
  # Whether terminal output is colored.
  colors: true

cache:
  # Directory in which models are cached.
  directory: '~/.cache/whisper'

inference:
  # Optional initial prompt that gives the decoder context.
  initial_prompt: ''
  # Sampling temperature: 0.0 is deterministic, 1.0 is more random.
  temperature: 0.0
  # Whether to suppress non-speech tokens.
  suppress_non_speech: false
  # Threshold for no-speech detection, in the range 0.0-1.0.
  no_speech_threshold: 0.6
  # Longest allowed segment, in characters; 0 means no limit.
  max_segment_length: 0
  # Whether to split timestamps on word boundaries.
  split_on_word: false

# Custom metadata injected into markdown (md) output as YAML front matter.
# These are arbitrary key-value pairs; they can also be set from the CLI with
# --meta key=value.
# custom_metadata:
#   description: "Lecture recording"
#   speaker: "John Doe"
#   location: "Beijing"