From 98325711d1dc032e2c9d20a2b567f73f0e87fc4c Mon Sep 17 00:00:00 2001
From: MayerTim <tim97mayer@googlemail.com>
Date: Thu, 11 Jun 2026 00:05:53 +0200
Subject: [PATCH] refactor(formatter): centralize word scanning helpers

---
 .../passes/structural/blockEndFormatting.ts   |  41 +----
 .../structural/caseExpressionFormatting.ts    |  43 +----
 .../passes/structural/cursorForFormatting.ts  |  61 +------
 .../passes/structural/exceptionFormatting.ts  |  44 +----
 .../structural/ifExpressionFormatting.ts      |  64 ++-----
 .../passes/structural/inlineIfFormatting.ts   |  73 +++-----
 .../structural/parenthesisFormatting.ts       |  26 +--
 .../structural/queryClauseFormatting.ts       |  70 +-------
 .../passes/structural/unionAllFormatting.ts   |  53 +-----
 src/formatter/sqlLineScanner.ts               | 161 ++++++++++++++++++
 10 files changed, 227 insertions(+), 409 deletions(-)

diff --git a/src/formatter/passes/structural/blockEndFormatting.ts b/src/formatter/passes/structural/blockEndFormatting.ts
index 1d02ec7..4d63794 100644
--- a/src/formatter/passes/structural/blockEndFormatting.ts
+++ b/src/formatter/passes/structural/blockEndFormatting.ts
@@ -1,5 +1,6 @@
 import type { SqlDialect } from '../../../dialects';
 import {
+  collectSqlWordsFromSegments,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
   type SqlOutsideSegment,
@@ -21,14 +22,6 @@ interface EndPhraseToken {
   readonly hasSemicolon: boolean;
 }
 
-interface WordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly normalized: string;
-}
-
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
 const BLOCK_END_FOLLOWERS = new Set(['if', 'for', 'loop', 'while', 'try', 'catch']);
 
 export function createInitialBlockEndFormattingState(): BlockEndFormattingState {
@@ -89,7 +82,7 @@ function collectBlockEndPhrases(
   line: string,
   outsideSegments: readonly SqlOutsideSegment[],
 ): EndPhraseToken[] {
-  const words = collectWords(line, outsideSegments);
+  const words = collectSqlWordsFromSegments(line, outsideSegments);
   const phrases: EndPhraseToken[] = [];
   let index = 0;
 
@@ -171,33 +164,3 @@ function consumeOptionalSemicolon(
 
   return { end: start, hasSemicolon: false };
 }
-
-function collectWords(line: string, outsideSegments: readonly SqlOutsideSegment[]): WordMatch[] {
-  const words: WordMatch[] = [];
-
-  for (const segment of outsideSegments) {
-    let index = segment.start;
-
-    while (index < segment.end) {
-      if (!SQL_WORD_START.test(line[index])) {
-        index += 1;
-        continue;
-      }
-
-      const start = index;
-      index += 1;
-
-      while (index < segment.end && SQL_WORD_PART.test(line[index])) {
-        index += 1;
-      }
-
-      words.push({
-        start,
-        end: index,
-        normalized: line.slice(start, index).toLowerCase(),
-      });
-    }
-  }
-
-  return words;
-}
diff --git a/src/formatter/passes/structural/caseExpressionFormatting.ts b/src/formatter/passes/structural/caseExpressionFormatting.ts
index 9ad531f..41b9931 100644
--- a/src/formatter/passes/structural/caseExpressionFormatting.ts
+++ b/src/formatter/passes/structural/caseExpressionFormatting.ts
@@ -1,8 +1,9 @@
 import type { SqlDialect } from '../../../dialects';
 import {
+  collectSqlWordsFromSegments,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
-  type SqlOutsideSegment,
+  type SqlWordMatch,
 } from '../../sqlLineScanner';
 
 export interface CaseExpressionFormattingState {
@@ -15,14 +16,8 @@ interface ExpandedLineResult {
   readonly nextState: CaseExpressionFormattingState;
 }
 
-interface WordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly normalized: string;
-}
+type WordMatch = SqlWordMatch;
 
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
 const CASE_SPLIT_WORDS = new Set(['when', 'then', 'else']);
 const BLOCK_END_FOLLOWERS = new Set(['for', 'if', 'loop', 'try', 'catch', 'while']);
 
@@ -46,7 +41,7 @@ export function expandWatcomCaseExpressionLine(
   initialState: CaseExpressionFormattingState,
 ): ExpandedLineResult {
   const scanResult = scanSqlLineOutsideLiteralsAndComments(line, initialState.scanState);
-  const words = collectWords(line, scanResult.outsideSegments);
+  const words = collectSqlWordsFromSegments(line, scanResult.outsideSegments);
   const nextState: CaseExpressionFormattingState = {
     scanState: scanResult.nextState,
     caseDepth: calculateNextCaseDepth(words, initialState.caseDepth),
@@ -182,33 +177,3 @@ function pushTrimmed(lines: string[], text: string): void {
     lines.push(trimmed);
   }
 }
-
-function collectWords(line: string, outsideSegments: readonly SqlOutsideSegment[]): WordMatch[] {
-  const words: WordMatch[] = [];
-
-  for (const segment of outsideSegments) {
-    let index = segment.start;
-
-    while (index < segment.end) {
-      if (!SQL_WORD_START.test(line[index])) {
-        index += 1;
-        continue;
-      }
-
-      const start = index;
-      index += 1;
-
-      while (index < segment.end && SQL_WORD_PART.test(line[index])) {
-        index += 1;
-      }
-
-      words.push({
-        start,
-        end: index,
-        normalized: line.slice(start, index).toLowerCase(),
-      });
-    }
-  }
-
-  return words;
-}
diff --git a/src/formatter/passes/structural/cursorForFormatting.ts b/src/formatter/passes/structural/cursorForFormatting.ts
index 443f3f2..f485ceb 100644
--- a/src/formatter/passes/structural/cursorForFormatting.ts
+++ b/src/formatter/passes/structural/cursorForFormatting.ts
@@ -1,9 +1,10 @@
 import type { SqlDialect } from '../../../dialects';
 import {
   cloneSqlLineScanState,
+  collectSqlWordsWithParenthesisDepth,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
-  type SqlOutsideSegment,
+  type SqlWordDepthMatch,
 } from '../../sqlLineScanner';
 
 export interface CursorForFormattingState {
@@ -16,15 +17,7 @@ interface ExpandedLineResult {
   readonly nextState: CursorForFormattingState;
 }
 
-interface WordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly normalized: string;
-  readonly depth: number;
-}
-
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
+type WordMatch = SqlWordDepthMatch;
 
 export function createInitialCursorForFormattingState(): CursorForFormattingState {
   return {
@@ -56,7 +49,7 @@ export function expandWatcomCursorForLine(
     return { lines: [line], nextState: baseNextState };
   }
 
-  const words = collectWords(line, scanResult.outsideSegments);
+  const words = collectSqlWordsWithParenthesisDepth(line, scanResult.outsideSegments);
   const { splitPoints, inCursorQuery } = findCursorForSplitPoints(
     words,
     initialState.inCursorQuery,
@@ -205,52 +198,6 @@ function splitLineAtIndexes(line: string, indexes: readonly number[]): string[]
   return lines.length > 0 ? lines : [line];
 }
 
-function collectWords(line: string, outsideSegments: readonly SqlOutsideSegment[]): WordMatch[] {
-  const words: WordMatch[] = [];
-  let depth = 0;
-
-  for (const segment of outsideSegments) {
-    let index = segment.start;
-
-    while (index < segment.end) {
-      const char = line[index];
-
-      if (char === '(') {
-        depth += 1;
-        index += 1;
-        continue;
-      }
-
-      if (char === ')') {
-        depth = Math.max(0, depth - 1);
-        index += 1;
-        continue;
-      }
-
-      if (!SQL_WORD_START.test(char)) {
-        index += 1;
-        continue;
-      }
-
-      const start = index;
-      index += 1;
-
-      while (index < segment.end && SQL_WORD_PART.test(line[index])) {
-        index += 1;
-      }
-
-      words.push({
-        start,
-        end: index,
-        normalized: line.slice(start, index).toLowerCase(),
-        depth,
-      });
-    }
-  }
-
-  return words;
-}
-
 function pushTrimmed(lines: string[], value: string): void {
   const trimmed = value.trim();
 
diff --git a/src/formatter/passes/structural/exceptionFormatting.ts b/src/formatter/passes/structural/exceptionFormatting.ts
index 701f008..3af7ffa 100644
--- a/src/formatter/passes/structural/exceptionFormatting.ts
+++ b/src/formatter/passes/structural/exceptionFormatting.ts
@@ -1,8 +1,9 @@
 import type { SqlDialect } from '../../../dialects';
 import {
+  collectSqlWordsFromSegments,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
-  type SqlOutsideSegment,
+  type SqlWordMatch,
 } from '../../sqlLineScanner';
 
 export interface ExceptionFormattingState {
@@ -15,14 +16,7 @@ interface ExpandedLineResult {
   readonly nextState: ExceptionFormattingState;
 }
 
-interface WordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly normalized: string;
-}
-
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
+type WordMatch = SqlWordMatch;
 
 export function createInitialExceptionFormattingState(): ExceptionFormattingState {
   return {
@@ -45,7 +39,7 @@ export function expandWatcomExceptionLine(
   initialState: ExceptionFormattingState,
 ): ExpandedLineResult {
   const scanResult = scanSqlLineOutsideLiteralsAndComments(line, initialState.scanState);
-  const words = collectWords(line, scanResult.outsideSegments);
+  const words = collectSqlWordsFromSegments(line, scanResult.outsideSegments);
   const nextState: ExceptionFormattingState = {
     scanState: scanResult.nextState,
     inExceptionSection: calculateNextExceptionSectionState(words, initialState.inExceptionSection),
@@ -187,33 +181,3 @@ function pushTrimmed(lines: string[], text: string): void {
     lines.push(trimmed);
   }
 }
-
-function collectWords(line: string, outsideSegments: readonly SqlOutsideSegment[]): WordMatch[] {
-  const words: WordMatch[] = [];
-
-  for (const segment of outsideSegments) {
-    let index = segment.start;
-
-    while (index < segment.end) {
-      if (!SQL_WORD_START.test(line[index])) {
-        index += 1;
-        continue;
-      }
-
-      const start = index;
-      index += 1;
-
-      while (index < segment.end && SQL_WORD_PART.test(line[index])) {
-        index += 1;
-      }
-
-      words.push({
-        start,
-        end: index,
-        normalized: line.slice(start, index).toLowerCase(),
-      });
-    }
-  }
-
-  return words;
-}
diff --git a/src/formatter/passes/structural/ifExpressionFormatting.ts b/src/formatter/passes/structural/ifExpressionFormatting.ts
index 43d2a12..5327533 100644
--- a/src/formatter/passes/structural/ifExpressionFormatting.ts
+++ b/src/formatter/passes/structural/ifExpressionFormatting.ts
@@ -1,9 +1,12 @@
 import type { SqlDialect } from '../../../dialects';
 import {
   cloneSqlLineScanState,
+  collectSqlWordsFromSegments,
+  findNextSqlWord,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
   type SqlOutsideSegment,
+  type SqlWordMatch,
 } from '../../sqlLineScanner';
 
 export interface IfExpressionFormattingState {
@@ -24,11 +27,7 @@ interface ExpandedLineResult {
   readonly nextState: IfExpressionFormattingState;
 }
 
-interface WordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly normalized: string;
-}
+type WordMatch = SqlWordMatch;
 
 interface EndIfMatch {
   readonly start: number;
@@ -36,8 +35,6 @@ interface EndIfMatch {
   readonly suffix: string;
 }
 
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
 const PROCEDURAL_BRANCH_STARTERS = new Set([
   'alter',
   'begin',
@@ -389,7 +386,7 @@ function findKeyword(
     let index = Math.max(segment.start, startIndex);
 
     while (index < segment.end) {
-      const word = readNextWord(line, index, segment.end);
+      const word = findNextSqlWord(line, index, segment.end);
 
       if (!word) {
         break;
@@ -415,7 +412,7 @@ function findEndIf(
     let index = Math.max(segment.start, startIndex);
 
     while (index < segment.end) {
-      const firstWord = readNextWord(line, index, segment.end);
+      const firstWord = findNextSqlWord(line, index, segment.end);
 
       if (!firstWord) {
         break;
@@ -430,7 +427,7 @@ function findEndIf(
       }
 
       if (firstWord.normalized === 'end') {
-        const secondWord = readNextWord(line, firstWord.end, segment.end);
+        const secondWord = findNextSqlWord(line, firstWord.end, segment.end);
 
         if (secondWord?.normalized === 'if') {
           return {
@@ -448,48 +445,11 @@ function findEndIf(
   return undefined;
 }
 
-function readNextWord(line: string, startIndex: number, endIndex: number): WordMatch | undefined {
-  for (let index = startIndex; index < endIndex; index += 1) {
-    if (!SQL_WORD_START.test(line[index])) {
-      continue;
-    }
-
-    const start = index;
-    index += 1;
-
-    while (index < endIndex && SQL_WORD_PART.test(line[index])) {
-      index += 1;
-    }
-
-    return {
-      start,
-      end: index,
-      normalized: line.slice(start, index).toLowerCase(),
-    };
-  }
-
-  return undefined;
-}
-
-function collectWords(line: string, outsideSegments: readonly SqlOutsideSegment[]): WordMatch[] {
-  const words: WordMatch[] = [];
-
-  for (const segment of outsideSegments) {
-    let index = segment.start;
-
-    while (index < segment.end) {
-      const word = readNextWord(line, index, segment.end);
-
-      if (!word) {
-        break;
-      }
-
-      words.push(word);
-      index = word.end;
-    }
-  }
-
-  return words;
+function collectWords(
+  line: string,
+  outsideSegments: readonly SqlOutsideSegment[],
+): readonly WordMatch[] {
+  return collectSqlWordsFromSegments(line, outsideSegments); // delegates to the shared scanner
 }
 
 function isProceduralBranchText(text: string): boolean {
diff --git a/src/formatter/passes/structural/inlineIfFormatting.ts b/src/formatter/passes/structural/inlineIfFormatting.ts
index 3abe964..af934ec 100644
--- a/src/formatter/passes/structural/inlineIfFormatting.ts
+++ b/src/formatter/passes/structural/inlineIfFormatting.ts
@@ -2,9 +2,13 @@ import type { SqlDialect } from '../../../dialects';
 import {
   cloneSqlLineScanState,
   createSqlOutsideLookup,
+  findNextSqlWord,
+  isSqlWordStart,
+  readSqlWordAt,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
   type SqlOutsideSegment,
+  type SqlWordMatch,
 } from '../../sqlLineScanner';
 
 interface ExpandedLineResult {
@@ -12,14 +16,7 @@ interface ExpandedLineResult {
   readonly nextState: SqlLineScanState;
 }
 
-interface KeywordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly text: string;
-}
-
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
+type KeywordMatch = SqlWordMatch;
 
 /**
  * Normalizes compact Watcom IF statements before the line-based formatter applies indentation.
@@ -67,7 +64,7 @@ function startsWithIfOutsideComments(
     }
 
     const word = readWordAt(line, firstContentIndex);
-    return word?.text.toLowerCase() === 'if';
+    return word?.normalized === 'if';
   });
 }
 
@@ -83,7 +80,7 @@ function tryExpandInlineIf(
 
   const ifMatch = readWordAt(line, firstContentIndex);
 
-  if (!ifMatch || ifMatch.text.toLowerCase() !== 'if') {
+  if (!ifMatch || ifMatch.normalized !== 'if') {
     return undefined;
   }
 
@@ -196,7 +193,7 @@ function splitConditionOnLogicalOperators(condition: string): string[] {
       continue;
     }
 
-    if (!SQL_WORD_START.test(current)) {
+    if (!isSqlWordStart(current)) {
       index += 1;
       continue;
     }
@@ -208,7 +205,7 @@ function splitConditionOnLogicalOperators(condition: string): string[] {
       continue;
     }
 
-    const normalized = word.text.toLowerCase();
+    const normalized = word.normalized;
 
     if (parenDepth === 0 && (normalized === 'and' || normalized === 'or')) {
       if (normalized === 'and' && betweenPending) {
@@ -249,19 +246,13 @@ function findKeyword(
     let index = Math.max(segment.start, startIndex);
 
     while (index < segment.end) {
-      const nextIndex = findWordStart(line, index, segment.end);
-
-      if (nextIndex < 0) {
-        break;
-      }
-
-      const word = readWordAt(line, nextIndex);
+      const word = findNextSqlWord(line, index, segment.end);
 
-      if (!word || word.end > segment.end) {
+      if (!word) {
         break;
       }
 
-      if (word.text.toLowerCase() === keyword) {
+      if (word.normalized === keyword) {
         return word;
       }
 
@@ -281,34 +272,28 @@ function findEndIf(
     let index = Math.max(segment.start, startIndex);
 
     while (index < segment.end) {
-      const nextIndex = findWordStart(line, index, segment.end);
+      const firstWord = findNextSqlWord(line, index, segment.end);
 
-      if (nextIndex < 0) {
+      if (!firstWord) {
         break;
       }
 
-      const firstWord = readWordAt(line, nextIndex);
-
-      if (!firstWord || firstWord.end > segment.end) {
-        break;
-      }
-
-      const normalized = firstWord.text.toLowerCase();
+      const normalized = firstWord.normalized;
 
       if (normalized === 'endif') {
         return firstWord;
       }
 
       if (normalized === 'end') {
-        const secondWordStart = findWordStart(line, firstWord.end, segment.end);
-        const secondWord = secondWordStart >= 0 ? readWordAt(line, secondWordStart) : undefined;
+        const secondWord = findNextSqlWord(line, firstWord.end, segment.end);
 
-        if (secondWord && secondWord.end <= segment.end && secondWord.text.toLowerCase() === 'if') {
+        if (secondWord?.normalized === 'if') {
           const trailingSemicolonEnd = consumeOptionalSemicolon(line, secondWord.end, segment.end);
           return {
             start: firstWord.start,
             end: trailingSemicolonEnd,
             text: line.slice(firstWord.start, trailingSemicolonEnd),
+            normalized: line.slice(firstWord.start, trailingSemicolonEnd).toLowerCase(),
           };
         }
       }
@@ -334,26 +319,6 @@ function consumeOptionalSemicolon(line: string, start: number, segmentEnd: numbe
   return line[index] === ';' ? index + 1 : start;
 }
 
-function findWordStart(line: string, start: number, end: number): number {
-  for (let index = start; index < end; index += 1) {
-    if (SQL_WORD_START.test(line[index])) {
-      return index;
-    }
-  }
-
-  return -1;
-}
-
 function readWordAt(line: string, start: number): KeywordMatch | undefined {
-  if (!SQL_WORD_START.test(line[start] ?? '')) {
-    return undefined;
-  }
-
-  let end = start + 1;
-
-  while (end < line.length && SQL_WORD_PART.test(line[end])) {
-    end += 1;
-  }
-
-  return { start, end, text: line.slice(start, end) };
+  return readSqlWordAt(line, start); // default end bound: may read up to the end of the line
 }
diff --git a/src/formatter/passes/structural/parenthesisFormatting.ts b/src/formatter/passes/structural/parenthesisFormatting.ts
index c5e286d..3dee221 100644
--- a/src/formatter/passes/structural/parenthesisFormatting.ts
+++ b/src/formatter/passes/structural/parenthesisFormatting.ts
@@ -1,6 +1,7 @@
 import {
   cloneSqlLineScanState,
   createSqlOutsideLookup,
+  readSqlWordBefore,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
 } from '../../sqlLineScanner';
@@ -21,7 +22,6 @@ export interface ParenthesisIndentAnalysis {
   readonly nextScanState: SqlLineScanState;
 }
 
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
 const TYPE_LENGTH_WORDS = new Set([
   'binary',
   'bit',
@@ -340,28 +340,8 @@ function isTypeLengthParenthesis(line: string, openIndex: number, inner: string)
     return false;
   }
 
-  const previousWord = readWordBefore(line, openIndex);
-  return previousWord !== undefined && TYPE_LENGTH_WORDS.has(previousWord.toLowerCase());
-}
-
-function readWordBefore(line: string, index: number): string | undefined {
-  let cursor = index - 1;
-
-  while (cursor >= 0 && /\s/u.test(line[cursor])) {
-    cursor -= 1;
-  }
-
-  const end = cursor + 1;
-
-  while (cursor >= 0 && SQL_WORD_PART.test(line[cursor])) {
-    cursor -= 1;
-  }
-
-  if (end === cursor + 1) {
-    return undefined;
-  }
-
-  return line.slice(cursor + 1, end);
+  const previousWord = readSqlWordBefore(line, openIndex);
+  return previousWord !== undefined && TYPE_LENGTH_WORDS.has(previousWord.normalized);
 }
 
 function skipWhitespace(line: string, start: number): number {
diff --git a/src/formatter/passes/structural/queryClauseFormatting.ts b/src/formatter/passes/structural/queryClauseFormatting.ts
index ffd925c..968c00a 100644
--- a/src/formatter/passes/structural/queryClauseFormatting.ts
+++ b/src/formatter/passes/structural/queryClauseFormatting.ts
@@ -2,9 +2,10 @@ import type { SqlDialect } from '../../../dialects';
 import {
   cloneSqlLineScanState,
   createSqlOutsideLookup,
+  collectSqlWordsWithParenthesisDepth,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
-  type SqlOutsideSegment,
+  type SqlWordDepthMatch,
 } from '../../sqlLineScanner';
 
 export interface QueryClauseFormattingState {
@@ -17,15 +18,8 @@ interface ExpandedLineResult {
   readonly nextState: QueryClauseFormattingState;
 }
 
-interface WordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly normalized: string;
-  readonly depth: number;
-}
+type WordMatch = SqlWordDepthMatch;
 
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
 const JOIN_PREFIXES = new Set(['cross', 'full', 'inner', 'left', 'right']);
 const LOGICAL_CLAUSE_STARTERS = new Set(['where', 'on', 'having', 'and', 'or']);
 
@@ -59,7 +53,11 @@ export function expandWatcomQueryClauseLine(
     return { lines: [line], nextState };
   }
 
-  const words = collectWords(line, scanResult.outsideSegments, initialState.parenthesisDepth);
+  const words = collectSqlWordsWithParenthesisDepth(
+    line,
+    scanResult.outsideSegments,
+    initialState.parenthesisDepth,
+  );
 
   if (words.length === 0) {
     return { lines: [line], nextState };
@@ -144,7 +142,7 @@ function splitLogicalContinuations(line: string): string[] {
     line,
     cloneSqlLineScanState({ inBlockComment: false }),
   );
-  const words = collectWords(line, scanResult.outsideSegments, 0);
+  const words = collectSqlWordsWithParenthesisDepth(line, scanResult.outsideSegments);
   const firstWord = words[0];
 
   if (!firstWord || firstWord.depth !== 0 || !LOGICAL_CLAUSE_STARTERS.has(firstWord.normalized)) {
@@ -197,56 +195,6 @@ function splitLineAtIndexes(line: string, indexes: readonly number[]): string[]
   return lines.length > 0 ? lines : [line];
 }
 
-function collectWords(
-  line: string,
-  outsideSegments: readonly SqlOutsideSegment[],
-  initialDepth: number,
-): WordMatch[] {
-  const words: WordMatch[] = [];
-  let depth = initialDepth;
-
-  for (const segment of outsideSegments) {
-    let index = segment.start;
-
-    while (index < segment.end) {
-      const char = line[index];
-
-      if (char === '(') {
-        depth += 1;
-        index += 1;
-        continue;
-      }
-
-      if (char === ')') {
-        depth = Math.max(0, depth - 1);
-        index += 1;
-        continue;
-      }
-
-      if (!SQL_WORD_START.test(char)) {
-        index += 1;
-        continue;
-      }
-
-      const start = index;
-      index += 1;
-
-      while (index < segment.end && SQL_WORD_PART.test(line[index])) {
-        index += 1;
-      }
-
-      words.push({
-        start,
-        end: index,
-        normalized: line.slice(start, index).toLowerCase(),
-        depth,
-      });
-    }
-  }
-
-  return words;
-}
-
 function updateParenthesisDepth(
   line: string,
   outside: readonly boolean[],
diff --git a/src/formatter/passes/structural/unionAllFormatting.ts b/src/formatter/passes/structural/unionAllFormatting.ts
index 29b0caf..ed886aa 100644
--- a/src/formatter/passes/structural/unionAllFormatting.ts
+++ b/src/formatter/passes/structural/unionAllFormatting.ts
@@ -1,8 +1,10 @@
 import {
   cloneSqlLineScanState,
+  findNextSqlWord,
   scanSqlLineOutsideLiteralsAndComments,
   type SqlLineScanState,
   type SqlOutsideSegment,
+  type SqlWordMatch,
 } from '../../sqlLineScanner';
 
 interface ExpandedLineResult {
@@ -10,14 +12,7 @@ interface ExpandedLineResult {
   readonly nextState: SqlLineScanState;
 }
 
-interface KeywordMatch {
-  readonly start: number;
-  readonly end: number;
-  readonly text: string;
-}
-
-const SQL_WORD_START = /[A-Za-z_]/u;
-const SQL_WORD_PART = /[A-Za-z0-9_$#]/u;
+type KeywordMatch = SqlWordMatch;
 
 /**
  * Keeps UNION ALL as its own physical SQL line.
@@ -60,31 +55,25 @@ function findUnionAllMatches(
     let index = segment.start;
 
     while (index < segment.end) {
-      const unionStart = findWordStart(line, index, segment.end);
+      const unionWord = findNextSqlWord(line, index, segment.end);
 
-      if (unionStart < 0) {
+      if (!unionWord) {
         break;
       }
 
-      const unionWord = readWordAt(line, unionStart);
-
-      if (!unionWord || unionWord.end > segment.end) {
-        break;
-      }
-
-      if (unionWord.text.toLowerCase() !== 'union') {
+      if (unionWord.normalized !== 'union') {
         index = unionWord.end;
         continue;
       }
 
-      const allStart = findWordStart(line, unionWord.end, segment.end);
-      const allWord = allStart >= 0 ? readWordAt(line, allStart) : undefined;
+      const allWord = findNextSqlWord(line, unionWord.end, segment.end);
 
-      if (allWord && allWord.end <= segment.end && allWord.text.toLowerCase() === 'all') {
+      if (allWord?.normalized === 'all') {
         matches.push({
           start: unionWord.start,
           end: allWord.end,
           text: line.slice(unionWord.start, allWord.end),
+          normalized: line.slice(unionWord.start, allWord.end).toLowerCase(),
         });
         index = allWord.end;
         continue;
@@ -119,27 +108,3 @@ function pushTrimmed(lines: string[], value: string): void {
     lines.push(trimmed);
   }
 }
-
-function findWordStart(line: string, start: number, end: number): number {
-  for (let index = start; index < end; index += 1) {
-    if (SQL_WORD_START.test(line[index])) {
-      return index;
-    }
-  }
-
-  return -1;
-}
-
-function readWordAt(line: string, start: number): KeywordMatch | undefined {
-  if (!SQL_WORD_START.test(line[start] ?? '')) {
-    return undefined;
-  }
-
-  let end = start + 1;
-
-  while (end < line.length && SQL_WORD_PART.test(line[end])) {
-    end += 1;
-  }
-
-  return { start, end, text: line.slice(start, end) };
-}
diff --git a/src/formatter/sqlLineScanner.ts b/src/formatter/sqlLineScanner.ts
index 8b6bd69..e9edf09 100644
--- a/src/formatter/sqlLineScanner.ts
+++ b/src/formatter/sqlLineScanner.ts
@@ -12,6 +12,167 @@ export interface SqlLineScanResult {
   readonly nextState: SqlLineScanState;
 }
 
+export interface SqlWordMatch {
+  readonly start: number; // index of the match's first character within the line
+  readonly end: number; // exclusive end index, so `line.slice(start, end)` is the match
+  readonly text: string; // the matched slice of the line with its original casing
+  readonly normalized: string; // `text` lower-cased, for case-insensitive comparison
+}
+
+export interface SqlWordDepthMatch extends SqlWordMatch {
+  readonly depth: number; // parenthesis nesting depth in effect where the word starts
+}
+
+const SQL_WORD_START_PATTERN = /[A-Za-z_]/u; // first character: ASCII letter or underscore
+const SQL_WORD_PART_PATTERN = /[A-Za-z0-9_$#]/u; // later characters: also digits, `$` and `#`
+
+export function isSqlWordStart(character: string | undefined): boolean {
+  return character !== undefined && SQL_WORD_START_PATTERN.test(character);
+}
+
+export function isSqlWordPart(character: string | undefined): boolean {
+  return character !== undefined && SQL_WORD_PART_PATTERN.test(character);
+}
+
+/**
+ * Reads the SQL word that begins exactly at `start`, scanning to the end of `line`.
+ *
+ * @param endIndex - Exclusive limit for the word's end; defaults to the line length.
+ * @returns The word, or `undefined` if none starts at `start` or it ends past `endIndex`.
+ */
+export function readSqlWordAt(
+  line: string,
+  start: number,
+  endIndex = line.length,
+): SqlWordMatch | undefined {
+  if (!isSqlWordStart(line[start])) {
+    return undefined;
+  }
+
+  let wordEnd = start + 1;
+  while (wordEnd < line.length && isSqlWordPart(line[wordEnd])) {
+    wordEnd += 1;
+  }
+
+  // A word running past `endIndex` is rejected whole rather than truncated.
+  if (wordEnd > endIndex) {
+    return undefined;
+  }
+
+  const text = line.slice(start, wordEnd);
+  return { start, end: wordEnd, text, normalized: text.toLowerCase() };
+}
+
+/**
+ * Reads the SQL word that ends just before `index`, skipping whitespace in between.
+ *
+ * @param line - Text to search.
+ * @param index - Position whose preceding word is wanted, e.g. an opening parenthesis.
+ * @returns The word, or `undefined` when no word characters precede `index` or the run
+ *   of word characters does not begin with a valid word-start character.
+ */
+export function readSqlWordBefore(line: string, index: number): SqlWordMatch | undefined {
+  // Step back over the whitespace that separates `index` from the word.
+  let end = index;
+  while (end > 0 && /\s/u.test(line[end - 1])) {
+    end -= 1;
+  }
+
+  // Then step back over the word characters themselves.
+  let start = end;
+  while (start > 0 && isSqlWordPart(line[start - 1])) {
+    start -= 1;
+  }
+
+  if (start === end || !isSqlWordStart(line[start])) {
+    return undefined;
+  }
+
+  const text = line.slice(start, end);
+  return { start, end, text, normalized: text.toLowerCase() };
+}
+
+/**
+ * Returns the first SQL word that starts at or after `startIndex` and ends at or before
+ * `endIndex`, or `undefined` when the range holds no such word.
+ */
+export function findNextSqlWord(
+  line: string,
+  startIndex: number,
+  endIndex: number,
+): SqlWordMatch | undefined {
+  let found: SqlWordMatch | undefined;
+  for (let at = startIndex; !found && at < endIndex; at += 1) {
+    found = readSqlWordAt(line, at, endIndex);
+  }
+  return found;
+}
+
+/**
+ * Collects every SQL word inside the given segments, in segment order and then by position.
+ * @param outsideSegments - Ranges of `line` to scan; a word must lie wholly inside one range.
+ * @returns The words found, each with its position, original text and lower-cased form.
+ */
+export function collectSqlWordsFromSegments(
+  line: string,
+  outsideSegments: readonly SqlOutsideSegment[],
+): readonly SqlWordMatch[] {
+  const collected: SqlWordMatch[] = [];
+
+  for (const { start, end } of outsideSegments) {
+    let match = findNextSqlWord(line, start, end);
+
+    // Resume right after each hit until the segment yields no further word.
+    while (match !== undefined) {
+      collected.push(match);
+      match = findNextSqlWord(line, match.end, end);
+    }
+  }
+
+  return collected;
+}
+
+/**
+ * Collects the SQL words inside the given segments, tagging each with the parenthesis
+ * depth in effect at its position.
+ *
+ * @param initialDepth - Depth to start from, e.g. the depth carried over from earlier lines.
+ */
+export function collectSqlWordsWithParenthesisDepth(
+  line: string,
+  outsideSegments: readonly SqlOutsideSegment[],
+  initialDepth = 0,
+): readonly SqlWordDepthMatch[] {
+  const collected: SqlWordDepthMatch[] = [];
+  let depth = initialDepth;
+
+  for (const { start, end } of outsideSegments) {
+    let position = start;
+
+    while (position < end) {
+      const word = readSqlWordAt(line, position, end);
+
+      if (word) {
+        collected.push({ ...word, depth });
+        position = word.end;
+        continue;
+      }
+
+      // Not a word: only parentheses change the running depth.
+      const char = line[position];
+      if (char === '(') {
+        depth += 1;
+      } else if (char === ')') {
+        // Unbalanced closers never push the depth below zero.
+        depth = Math.max(0, depth - 1);
+      }
+      position += 1;
+    }
+  }
+
+  return collected;
+}
+
 export function createInitialSqlLineScanState(): SqlLineScanState {
   return { inBlockComment: false };
 }