pbs · foxPrateek · Aug 27, 2025 · Sep 4, 2025 · Sep 26, 2025 · OlteanuRares
@@ -78,6 +78,7 @@
  just carried over when implementing positioning.
 """
 
+import os
 import math
 import re
 import textwrap
@@ -367,13 +368,13 @@ def _handle_double_command(self, word):
         # doubled special characters and doubled extended characters
         # with only one member of each pair being displayed.
 
-        doubled_types = (word != "94a1" and word in COMMANDS) or _is_pac_command(word) or word in SPECIAL_CHARS
+        doubled_types = (
+            (word != "94a1" and word in COMMANDS)
+            or _is_pac_command(word)
+            or word in SPECIAL_CHARS
+        )
         if self.double_starter:
-            doubled_types = (
-                doubled_types
-                or word in EXTENDED_CHARS
-                or word == "94a1"
-            )
+            doubled_types = doubled_types or word in EXTENDED_CHARS or word == "94a1"
 
         if word in CUE_STARTING_COMMAND and word != self.last_command:
             self.double_starter = False
@@ -535,8 +536,9 @@ def _pop_on(self, end=0):
 
 
 class SCCWriter(BaseWriter):
-    def __init__(self, *args, **kw):
+    def __init__(self, *args, drop_frame=True, **kw):
         super().__init__(*args, **kw)
+        self.drop_frame = drop_frame
 
     def write(self, caption_set):
         output = HEADER + "\n\n"
@@ -549,7 +551,10 @@ def write(self, caption_set):
         # Only support one language.
         lang = list(caption_set.get_languages())[0]
         captions = caption_set.get_captions(lang)
-
+        if self.drop_frame == True:
+            MICROSECONDS_PER_CODEWORD = 1000.0 * 1000.0 / (30.0 * 1000.0 / 1001.0)
+        elif self.drop_frame == False:
+            MICROSECONDS_PER_CODEWORD = 1000.0 * 1000.0 / (30.0)
         # PASS 1: compute codes for each caption
         codes = [
             (self._text_to_code(caption), caption.start, caption.end)
@@ -560,25 +565,128 @@ def write(self, caption_set):
         # Advance start times so as to have time to write to the pop-on
         # buffer; possibly remove the previous clear-screen command
         for index, (code, start, end) in enumerate(codes):
-            code_words = len(code) / 5 + 8
-            code_time_microseconds = code_words * MICROSECONDS_PER_CODEWORD
+            # code_words = len(code) / 5 + 8
+            code_words = len(code.split()) + 8
+
+            code_time_microseconds = (code_words + 2) * MICROSECONDS_PER_CODEWORD
             code_start = start - code_time_microseconds
             if index == 0:
+                # also back-shift first cue so load-time is accounted for
+                codes[index] = (code, code_start, end)
                 continue
             previous_code, previous_start, previous_end = codes[index - 1]
-            if previous_end + 3 * MICROSECONDS_PER_CODEWORD >= code_start:
+            # concatenate overlapping code
+            if code_start <= (previous_start + MICROSECONDS_PER_CODEWORD):
+                prev_words = len(previous_code.split()) + 8  # ENM/RCL + overhead
+                code_start = max(
+                    code_start, previous_start + prev_words * MICROSECONDS_PER_CODEWORD
+                )
+                codes[index] = (code, code_start, end)
+                # Also ensure previous ends before this starts if they nearly touch
                 codes[index - 1] = (previous_code, previous_start, None)
-            codes[index] = (code, code_start, end)
+            else:
+                if (
+                    previous_end is not None
+                    and previous_end + 3 * MICROSECONDS_PER_CODEWORD >= code_start
+                ):
+                    codes[index - 1] = (previous_code, previous_start, None)
+                codes[index] = (code, code_start, end)
 
         # PASS 3:
+        # Remove empty captions due to code concatenation
+        codes = list(filter(None, codes))
+
+        # PASS 4:
         # Write captions.
+        def _df_frames(us: int) -> int:
+            # must mirror _format_timestamp_df quantization
+            return math.floor(us * 30 / 1_000_000 * 1000 / 1001 + 1e-9)
+
+        last_emitted_frames = -1
+
         for code, start, end in codes:
-            output += f"{self._format_timestamp(start)}\t"
+            # bump by one frame if this quantizes to the same DF frame as previous
+            cur_frames = _df_frames(start) if self.drop_frame else None
+            if self.drop_frame and cur_frames <= last_emitted_frames:
+                # add exactly one codeword's worth of time until frame advances
+                while _df_frames(start) <= last_emitted_frames:
+                    start += MICROSECONDS_PER_CODEWORD
+                cur_frames = _df_frames(start)
+            if self.drop_frame:
+                last_emitted_frames = cur_frames
+
+            if not self.drop_frame:
+                # bump by one codeword if same HH:MM:SS:FF as last
+                def _ndf_frames(us: int) -> int:
+                    # mirror _format_timestamp_ndf
+                    seconds_float = us / 1_000_000.0 * 1000.0 / 1001.0
+                    total_frames = int(seconds_float * 30)  # floor
+                    return total_frames
+
+                cur_frames = _ndf_frames(start)
+                if cur_frames <= last_emitted_frames:
+                    while _ndf_frames(start) <= last_emitted_frames:
+                        start += MICROSECONDS_PER_CODEWORD
+                last_emitted_frames = _ndf_frames(start)
+
+            # ---- MINIMAL SPLIT (only if >80 tokens) ----
+            _prefix = ["94ae", "94ae", "9420", "9420"]
+            _suffix = ["942f", "942f"]
+            _code_tokens = code.split()
+            _total = len(_prefix) + len(_code_tokens) + len(_suffix)
+            if _total > 80:
+                # first chunk at 'start'
+                _keep = 80 - len(_prefix) - len(_suffix)  # 74
+                _first = _prefix + _code_tokens[:_keep] + _suffix
+                if self.drop_frame:
+                    ts_start = self._format_timestamp_df(start)
+                else:
+                    ts_start = self._format_timestamp_ndf(start)
+                output += f"{ts_start}\t" + " ".join(_first) + "\n\n"
+
+                # second chunk at start + 1 codeword (and guard again)
+                start2 = start + MICROSECONDS_PER_CODEWORD
+                cur2 = _df_frames(start2) if self.drop_frame else None
+                if self.drop_frame and cur2 <= last_emitted_frames:
+                    while _df_frames(start2) <= last_emitted_frames:
+                        start2 += MICROSECONDS_PER_CODEWORD
+                    cur2 = _df_frames(start2)
+                if self.drop_frame:
+                    last_emitted_frames = cur2
+                _rest = _prefix + _code_tokens[_keep:] + _suffix
+                if self.drop_frame:
+                    ts2 = self._format_timestamp_df(start2)
+                else:
+                    ts2 = self._format_timestamp_ndf(start2)
+                output += f"{ts2}\t" + " ".join(_rest) + "\n\n"
+
+                if end is not None:
+                    ts_end = (
+                        self._format_timestamp_df(end)
+                        if self.drop_frame
+                        else self._format_timestamp_ndf(end)
+                    )
+                    output += f"{ts_end}\t942c 942c\n\n"
+                continue
+            # ---- /MINIMAL SPLIT ----
+            # choose formatter based on flag
+            if self.drop_frame:
+                ts_start = self._format_timestamp_df(start)
+            else:
+                ts_start = self._format_timestamp_ndf(start)
+
+            output += f"{ts_start}\t"
             output += "94ae 94ae 9420 9420 "
-            output += code
-            output += "942c 942c 942f 942f\n\n"
+            output += code + " "
+            output += "942f 942f\n\n"
+
             if end is not None:
-                output += f"{self._format_timestamp(end)}\t942c 942c\n\n"
+                if self.drop_frame:
+                    ts_end = self._format_timestamp_df(end)
+                else:
+                    ts_end = self._format_timestamp_ndf(end)
+
+                output += f"{ts_end}\t942c 942c\n\n"
 
         return output
 
@@ -626,7 +734,7 @@ def _text_to_code(self, s):
         for row, line in enumerate(lines):
             row += 16 - len(lines)
             # Move cursor to column 0 of the destination row
-            for _ in range(2):
+            for _ in range(1):
                 code += (
                     PAC_HIGH_BYTE_BY_ROW[row]
                     + f"{PAC_LOW_BYTE_BY_ROW_RESTRICTED[row]} "
@@ -639,17 +747,55 @@ def _text_to_code(self, s):
         return code
 
     @staticmethod
-    def _format_timestamp(microseconds):
-        seconds_float = microseconds / 1000.0 / 1000.0
-        # Convert to non-drop-frame timecode
+    def _format_timestamp_df(us: int) -> str:
+        """
+        29.97 fps DROP-FRAME timecode (HH:MM:SS;FF) from microseconds.
+        Uses exact 30000/1001 math and the standard 10-minute block method.
+        """
+        # 1) Real frames at 30000/1001 (quantize to nearest frame)
+        total_frames = int(round(us * 30 / 1_000_000 * 1000 / 1001 + 1e-9))
+
+        # 2) Add back the dropped frames to get "timecode counting" frames
+        fps = 30
+        FRAMES_PER_10MIN = 17982  # 10*60*30 - 2*9
+        d = total_frames // FRAMES_PER_10MIN  # number of full 10-min blocks
+        m = total_frames % FRAMES_PER_10MIN  # remainder into current block
+        # Within each 10-min block, 2 frames are dropped at the top of minutes 1–9.
+        if m < 2:
+            tc_frames = total_frames + 18 * d
+        else:
+            tc_frames = total_frames + 18 * d + 2 * ((m - 2) // 1798)
+
+        # 3) Derive HH:MM:SS;FF in "timecode counting" space (30 fps)
+        hours = tc_frames // (fps * 60 * 60)
+        rem = tc_frames % (fps * 60 * 60)
+        minutes = rem // (fps * 60)
+        rem = rem % (fps * 60)
+        seconds = rem // fps
+        frames = rem % fps
+
+        return f"{hours:02d}:{minutes:02d}:{seconds:02d};{frames:02d}"
+
+    @staticmethod
+    def _format_timestamp_ndf(microseconds: int) -> str:
+        """
+        Convert microseconds → 29.97fps non-drop-frame timecode (HH:MM:SS:FF).
+        Uses 30fps frame counting with NTSC correction factor.
+        """
+        # Convert to elapsed seconds
+        seconds_float = microseconds / 1_000_000.0
+        # Apply NTSC fudge factor to align to 29.97 fps
         seconds_float *= 1000.0 / 1001.0
+
         hours = math.floor(seconds_float / 3600)
         seconds_float -= hours * 3600
         minutes = math.floor(seconds_float / 60)
         seconds_float -= minutes * 60
         seconds = math.floor(seconds_float)
         seconds_float -= seconds
         frames = math.floor(seconds_float * 30)
+
+        # Return non-drop-frame formatted timecode (colon!)
         return f"{hours:02}:{minutes:02}:{seconds:02}:{frames:02}"