クラス設計の方針

各ステップをメソッドとして分離し、normalize() で全処理を順番に呼び出す構成にする。メソッドを独立させることで、単体テストや一部のステップだけを適用することが容易になる。

Python — クラスの骨格
class CobolNormalizer:
    """Skeleton of the normalizer: one method per step, chained by normalize()."""

    def remove_sequence_numbers(self, lines):
        """PART 03: remove the line labels (sequence numbers)."""

    def remove_comments(self, lines):
        """PART 03: delete comment lines."""

    def remove_identification_area(self, lines):
        """PART 03: remove the identification area."""

    def join_continuation_lines(self, lines):
        """PART 04: join continuation lines."""

    def normalize_spaces(self, lines):
        """PART 05: squeeze runs of spaces."""

    def split_to_one_statement(self, lines):
        """PART 06: split into one statement per line."""

    def normalize(self, source: str) -> list[str]:
        """Apply every step in order, feeding each result into the next step."""
        pipeline = (
            self.remove_sequence_numbers,
            self.remove_comments,
            self.remove_identification_area,
            self.join_continuation_lines,
            self.normalize_spaces,
            self.split_to_one_statement,
        )
        lines = source.splitlines()
        for step in pipeline:
            lines = step(lines)
        return lines

CobolNormalizer — フル実装

Python — cobol_normalizer.py(完全版)
"""
cobol_normalizer.py
COBOLソースコードを正規化して1行1命令形式に変換する。
"""
import re


# COBOL words may contain hyphens, so a plain \b would also fire inside names
# such as WS-END-IF or PIC-CODE.  These look-arounds treat '-' as a word
# character.
_WORD_START = r'(?<![A-Za-z0-9-])'
_WORD_END = r'(?![A-Za-z0-9-])'

# Explicit scope terminators.  Matched case-insensitively, like the PICTURE
# keyword below.
_END_SCOPE_PATTERN = re.compile(
    _WORD_START
    + r'(END-IF|END-PERFORM|END-EVALUATE|END-READ|END-WRITE|'
    r'END-REWRITE|END-DELETE|END-START|END-RETURN|END-RECEIVE|'
    r'END-CALL|END-COMPUTE|END-ADD|END-SUBTRACT|END-MULTIPLY|'
    r'END-DIVIDE|END-STRING|END-UNSTRING|END-INSPECT|END-SEARCH)'
    + _WORD_END,
    re.IGNORECASE,
)

# PIC / PICTURE keyword, an optional IS, and the picture character-string that
# follows (group "string" runs up to the next whitespace).
_PIC_START_PATTERN = re.compile(
    _WORD_START + r'(?:PICTURE|PIC)' + _WORD_END
    + r'(?:\s+IS' + _WORD_END + r')?\s*(?P<string>\S*)',
    re.IGNORECASE,
)

# A period that is followed by whitespace or the end of the text.
_SEPARATOR_PERIOD_PATTERN = re.compile(r'\s*\.(?!\S)')

_QUOTE_CHARS = ("'", '"')

# Characters (or start of text, '') after which ".<digit>" starts a numeric
# literal such as .5 rather than ending a statement.
_NUMERIC_LITERAL_LEAD = frozenset(
    ('', ' ', '(', '+', '-', '*', '/', '=', '<', '>', ',')
)

# Paragraph headers that stay on the same output line as the entry following
# them (e.g. "PROGRAM-ID. NORMTEST.").
_HEADERS_KEEPING_OPERAND = frozenset(('PROGRAM-ID.',))


class CobolNormalizer:
    """Normalize fixed-format COBOL source into one statement per line.

    The pipeline strips columns 1-6, drops comment lines, cuts everything past
    column 72, joins continuation lines, squeezes spaces outside literals and
    finally splits / merges the text so that every returned string holds one
    statement.
    """

    def normalize(self, source: str) -> list[str]:
        """Return the normalized statements for the given source text.

        Args:
            source: Complete COBOL source as a single string.

        Returns:
            One string per statement, in source order.
        """
        lines = source.splitlines()
        lines = self.remove_sequence_numbers(lines)
        lines = self.remove_comments(lines)
        lines = self.remove_identification_area(lines)
        lines = self.join_continuation_lines(lines)
        lines = self.normalize_spaces(lines)
        lines = self.split_to_one_statement(lines)
        return lines

    # -- Step 1: sequence-number area ------------------------------------
    def remove_sequence_numbers(self, lines: list[str]) -> list[str]:
        """Drop the first six columns of every line.

        Tabs are expanded to 8-column stops first.  Lines of six characters
        or fewer are discarded.  Index 0 of each returned line is the
        indicator column (column 7).
        """
        expanded = (
            line.rstrip('\n').rstrip('\r').expandtabs(8) for line in lines
        )
        return [line[6:] for line in expanded if len(line) > 6]

    # -- Step 2: comment lines -------------------------------------------
    def remove_comments(self, lines: list[str]) -> list[str]:
        """Drop empty lines and lines whose indicator column is '*' or '/'."""
        return [line for line in lines if line and line[0] not in ('*', '/')]

    # -- Step 3: identification area -------------------------------------
    def remove_identification_area(self, lines: list[str]) -> list[str]:
        """Keep columns 7-72 (66 characters) and strip trailing whitespace."""
        return [line[:66].rstrip() for line in lines]

    # -- Step 4: continuation lines --------------------------------------
    def join_continuation_lines(self, lines: list[str]) -> list[str]:
        """Merge lines whose indicator column is '-' into the preceding line.

        The continuation text is appended directly after the last non-blank
        character of the previous line, without an intervening space, so a
        word or literal split across two lines is restored
        (``'HELLO WOR`` + ``LD'`` -> ``'HELLO WORLD'``).  When the
        continuation starts with a quote, that quote is dropped, together
        with a matching quote that ends the previous line.
        """
        result: list[str] = []
        for line in lines:
            if not line:
                continue
            if line[0] != '-':
                result.append(line)
                continue

            body = line[1:].lstrip()
            if not result:
                # Nothing to continue: keep the text as a line of its own.
                result.append(body)
                continue

            prev = result[-1]
            if body[:1] in _QUOTE_CHARS:
                quote = body[0]
                if prev.endswith(quote):
                    prev = prev[:-1]
                result[-1] = prev + body[1:]
            else:
                # No space is inserted: the continued token resumes exactly
                # where the previous line stopped.
                result[-1] = prev.rstrip() + body
        return result

    # -- Step 5: space normalization -------------------------------------
    def normalize_spaces(self, lines: list[str]) -> list[str]:
        """Squeeze spaces on every line and drop lines that become empty."""
        squeezed = (self._squeeze_spaces(line) for line in lines)
        return [line for line in squeezed if line]

    def _squeeze_spaces(self, line: str) -> str:
        """Collapse runs of spaces outside quoted literals and trim the ends."""
        output, in_string, quote_char, prev_space = [], False, '', False
        for ch in line:
            if in_string:
                output.append(ch)
                if ch == quote_char:
                    in_string = False
                prev_space = False
            elif ch in _QUOTE_CHARS:
                in_string, quote_char = True, ch
                output.append(ch)
                prev_space = False
            elif ch == ' ':
                if not prev_space:
                    output.append(ch)
                prev_space = True
            else:
                output.append(ch)
                prev_space = False
        return ''.join(output).strip()

    # -- Step 6: one statement per line ----------------------------------
    def split_to_one_statement(self, lines: list[str]) -> list[str]:
        """Re-cut the lines so that each output string is one statement.

        A statement ends at a separator period or at an explicit scope
        terminator (END-IF, END-PERFORM, ...).  Text that has not reached
        either is carried over and joined, with a single space, to the next
        line.  Whatever is still unterminated after the last line is
        returned as the final element.
        """
        statements: list[str] = []
        pending: list[str] = []  # fragments of the statement being assembled
        for line in lines:
            if pending:
                pending.append(' ')
            self._scan(line, pending, statements)
        tail = ''.join(pending).strip()
        if tail:
            statements.append(tail)
        return statements

    def _tokenize_line(self, line: str) -> list[str]:
        """Split a single line into statements; an unterminated rest comes last."""
        statements: list[str] = []
        pending: list[str] = []
        self._scan(line, pending, statements)
        tail = ''.join(pending).strip()
        if tail:
            statements.append(tail)
        return statements

    def _scan(self, text: str, pending: list[str], out: list[str]) -> None:
        """Scan ``text``, appending finished statements to ``out``.

        ``pending`` holds the fragments of the statement currently being
        assembled.  It is extended in place and emptied whenever a statement
        is completed, so text left in it after the call is unterminated.
        """
        i, n = 0, len(text)
        while i < n:
            ch = text[i]

            if ch in _QUOTE_CHARS:
                # Copy the literal verbatim (to end of text if unclosed) so
                # periods and keywords inside it are never interpreted.
                close = text.find(ch, i + 1)
                end = n if close == -1 else close + 1
                pending.append(text[i:end])
                i = end
                continue

            if ch == '.':
                pending.append(ch)
                i += 1
                if self._is_decimal_point(text, i - 1):
                    continue
                stmt = ''.join(pending).strip()
                if stmt.upper() in _HEADERS_KEEPING_OPERAND:
                    # Keep e.g. "PROGRAM-ID." together with the name after it.
                    continue
                pending.clear()
                if stmt and stmt != '.':
                    out.append(stmt)
                continue

            if ch in ('P', 'p'):
                m = _PIC_START_PATTERN.match(text, i)
                if m:
                    end = m.end()
                    if m.group('string').endswith('.'):
                        # The trailing period is the separator, not part of
                        # the picture string; let the period branch see it.
                        end -= 1
                    pending.append(text[i:end])
                    i = end
                    continue

            if ch in ('E', 'e'):
                m = _END_SCOPE_PATTERN.match(text, i)
                if m:
                    end = m.end()
                    period = _SEPARATOR_PERIOD_PATTERN.match(text, end)
                    if period:
                        # "END-IF." stays one statement.
                        end = period.end()
                    pending.append(text[i:end])
                    i = end
                    out.append(''.join(pending).strip())
                    pending.clear()
                    continue

            pending.append(ch)
            i += 1

    @staticmethod
    def _is_decimal_point(text: str, pos: int) -> bool:
        """Tell whether the period at ``text[pos]`` is part of a number.

        True for "1.5" (digit on both sides) and for a leading decimal point
        such as " .5" or "(.5"; in every other case the period is treated as
        a statement separator.
        """
        if not text[pos + 1:pos + 2].isdigit():
            return False
        prev = text[pos - 1] if pos else ''
        return prev.isdigit() or prev in _NUMERIC_LITERAL_LEAD

動作確認デモ

Python — デモ用 COBOLスニペット
# Fixed-format sample covering comment lines, a continued literal, a decimal
# literal, two statements on one line and an IF ... END-IF block.
sample_cobol = """\
000100 IDENTIFICATION DIVISION.
000200* =======================================
000300* サンプルプログラム(正規化テスト用)
000400* =======================================
000500 PROGRAM-ID. NORMTEST.
000600 DATA DIVISION.
000700 WORKING-STORAGE SECTION.
000800 01 WS-MSG      PIC X(30).
000900 01 WS-AMT      PIC 9(5)V99.
001000 01 WS-FLG      PIC X(1).
001100 PROCEDURE DIVISION.
001200 MAIN-PARA.
001300     MOVE 'HELLO WOR
001400-    LD' TO WS-MSG.
001500     MOVE 1.5 TO WS-AMT.
001600     DISPLAY WS-MSG.  DISPLAY WS-AMT.
001700     IF WS-FLG = 'Y'
001800         DISPLAY 'FLAG ON'
001900     END-IF
002000     STOP RUN.
"""

# Run the whole pipeline and print one normalized statement per output line.
normalizer = CobolNormalizer()
result = normalizer.normalize(sample_cobol)
print('\n'.join(result))
実行結果(正規化後)
IDENTIFICATION DIVISION.
PROGRAM-ID. NORMTEST.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 WS-MSG PIC X(30).
01 WS-AMT PIC 9(5)V99.
01 WS-FLG PIC X(1).
PROCEDURE DIVISION.
MAIN-PARA.
MOVE 'HELLO WORLD' TO WS-MSG.
MOVE 1.5 TO WS-AMT.
DISPLAY WS-MSG.
DISPLAY WS-AMT.
IF WS-FLG = 'Y' DISPLAY 'FLAG ON' END-IF
STOP RUN.

ファイル入出力

実際のソースファイルを読み込んで正規化し、出力ファイルに書き出す例を示す。

Python — ファイル入出力
from pathlib import Path

def normalize_file(input_path: str, output_path: str, encoding: str = 'utf-8') -> None:
    """Normalize a COBOL source file and write the result to another file.

    Args:
        input_path: Path of the COBOL source file to read.
        output_path: Path of the file that receives the normalized text.
        encoding: Codec used to decode the input; pick the one matching your
            environment. Bytes that cannot be decoded are replaced rather
            than raising an error.

    The output is always written as UTF-8, one statement per line, and a
    summary line is printed when done.

    NOTE(review): the earlier docstring offered 'cp930' as an example value.
    That name is not in Python's list of standard codecs; 'cp932'
    (Shift-JIS) was probably meant -- confirm.
    """
    raw_text = Path(input_path).read_text(encoding=encoding, errors='replace')
    statements = CobolNormalizer().normalize(raw_text)
    rendered = '\n'.join(statements) + '\n'
    Path(output_path).write_text(rendered, encoding='utf-8')
    print(f"完了: {len(statements)} 命令 → {output_path}")


# Usage example: read SAMPLE.CBL and write the normalized statements next to it.
normalize_file(input_path='SAMPLE.CBL', output_path='SAMPLE_normalized.CBL')

次の章では…

PART 08 ではこのツールを使った後にできること(差分比較・grep・簡易パーサーへの接続)と、本実装の限界について整理する。

→ PART 08 — まとめと発展へ