mxnix · goddammit1 · Jun 18, 2026 · Jun 18, 2026 · gemini-code-assist · Jun 18, 2026
diff --git a/lib/proxy/openai/openai_request_parser.dart b/lib/proxy/openai/openai_request_parser.dart
@@ -146,12 +146,11 @@ class OpenAiRequestParser {
       throw const FormatException('`messages` must be an array.');
     }
 
-    final leadingSystemParts = <String>[];
+    final systemParts = <String>[];
     final turns = <UnifiedTurn>[];
     final toolDeclarations = _parseTools(json['tools']);
     final toolCallNames = <String, String>{};
     final shouldIgnoreReasoningPrefill = _isGeminiModel(model);
-    var seenNonSystemMessage = false;
 
     for (final rawMessage in messages) {
       if (rawMessage is! Map) {
@@ -164,16 +163,23 @@ class OpenAiRequestParser {
         if (text.isEmpty) {
           continue;
         }
-        if (!seenNonSystemMessage) {
-          leadingSystemParts.add(text);
-        } else {
-          turns.add(UnifiedTurn(role: 'user', parts: [UnifiedPart.text(text)]));
-        }
+        // System/developer messages can appear anywhere in the request. Clients
+        // like SillyTavern frequently inject large instruction blocks (thinking
+        // protocols, jailbreaks, "post-history instructions") *after* the real
+        // history and even after the user's latest message.
+        //
+        // These must NOT become a trailing user turn, nor be appended to the
+        // last user turn: doing so buries the genuine (often short) user
+        // question under a huge instruction block, and the model ends up
+        // responding to the instruction instead of the actual latest message.
+        //
+        // Instead, route every system/developer message into the system
+        // instruction. This keeps the real last user message as the final turn
+        // while still delivering the instructions where models expect them.
+        systemParts.add(text);
         continue;
       }
 
-      seenNonSystemMessage = true;
-
       if (role == 'assistant' &&
           shouldIgnoreReasoningPrefill &&
           _isStandaloneReasoningPrefill(message['content']) &&
@@ -246,7 +252,7 @@ class OpenAiRequestParser {
     final jsonSchema = _readMapValue(responseFormat?['json_schema'], 'response_format.json_schema');
     final googleWebSearchEnabled = _parseGoogleWebSearchEnabled(json);
     final kiroServerToolsEnabled = _parseKiroServerToolsEnabled(json);
-    final mergedSystemInstruction = leadingSystemParts.join('\n\n').trim();
+    final mergedSystemInstruction = systemParts.join('\n\n').trim();
     var systemInstruction = mergedSystemInstruction.isEmpty ? null : mergedSystemInstruction;
     if (turns.isEmpty && systemInstruction != null) {
       turns.add(UnifiedTurn(role: 'user', parts: [UnifiedPart.text(systemInstruction)]));

diff --git a/test/openai_request_parser_test.dart b/test/openai_request_parser_test.dart
@@ -243,7 +243,7 @@ void main() {
     expect(functionCall.thoughtSignature, 'sig_read_file');
   });
 
-  test('keeps only leading system and developer notes in system instruction', () {
+  test('routes all system and developer messages into system instruction', () {
     final request = OpenAiRequestParser.parseChatRequest({
       'model': 'gemini-2.5-pro',
       'messages': [
@@ -254,12 +254,14 @@ void main() {
       ],
     }, requestId: 'req_late_system');
 
-    expect(request.systemInstruction, 'Lead instruction.');
-    expect(request.turns, hasLength(3));
+    // Every system/developer message (leading or mid-conversation) is routed
+    // into the system instruction. This keeps the real conversation turns
+    // intact and prevents an injected instruction from becoming a trailing turn
+    // that the model would answer instead of the actual latest message.
+    expect(request.systemInstruction, 'Lead instruction.\n\nLate policy update.');
+    expect(request.turns, hasLength(2));
     expect(request.turns.first.role, 'user');
     expect(request.turns.first.parts.single.text, 'Hello');
-    expect(request.turns[1].role, 'user');
-    expect(request.turns[1].parts.single.text, 'Late policy update.');
     expect(request.turns.last.role, 'assistant');
     expect(request.turns.last.parts.single.text, 'Hi there');
   });
@@ -275,12 +277,16 @@ void main() {
       ],
     }, requestId: 'req_trailing_system_after_assistant');
 
-    expect(request.systemInstruction, 'You are a roleplay director.');
-    expect(request.turns, hasLength(2));
-    expect(request.turns[0].role, 'assistant');
-    expect(request.turns[0].parts.single.text, 'Previous character reply.');
-    expect(request.turns[1].role, 'user');
-    expect(request.turns[1].parts.single.text, 'Pause roleplay and write the memory book.');
+    // The standalone "<think>" prefill is dropped for Gemini, and the trailing
+    // system instruction is merged into the system instruction rather than
+    // becoming a user turn. Only the genuine assistant reply remains as a turn.
+    expect(
+      request.systemInstruction,
+      'You are a roleplay director.\n\nPause roleplay and write the memory book.',
+    );
+    expect(request.turns, hasLength(1));
+    expect(request.turns.single.role, 'assistant');
+    expect(request.turns.single.parts.single.text, 'Previous character reply.');
   });
 
   test('preserves standalone reasoning prefills for non-Gemini models', () {