クラス設計の方針
各ステップをメソッドとして分離し、normalize() で全処理を順番に呼び出す構成にする。
メソッドを独立させることで単体テストや部分適用が容易になる。
class CobolNormalizer:
    """Design sketch: one method per step, chained together by normalize()."""

    def remove_sequence_numbers(self, lines):
        """PART 03: strip the line labels."""

    def remove_comments(self, lines):
        """PART 03: delete comment lines."""

    def remove_identification_area(self, lines):
        """PART 03: strip the identification area."""

    def join_continuation_lines(self, lines):
        """PART 04: join continuation lines."""

    def normalize_spaces(self, lines):
        """PART 05: squeeze runs of spaces."""

    def split_to_one_statement(self, lines):
        """PART 06: put one statement on each line."""

    def normalize(self, source: str) -> list[str]:
        """Apply every step in order."""
        pipeline = (
            self.remove_sequence_numbers,
            self.remove_comments,
            self.remove_identification_area,
            self.join_continuation_lines,
            self.normalize_spaces,
            self.split_to_one_statement,
        )
        lines = source.splitlines()
        # Each step receives the output of the previous one.
        for step in pipeline:
            lines = step(lines)
        return lines
CobolNormalizer — フル実装
"""
cobol_normalizer.py
COBOLソースコードを正規化して1行1命令形式に変換する。
"""
import re
# Characters that open (and close) a COBOL alphanumeric literal.
_QUOTES = ("'", '"')

# COBOL words may contain hyphens, so "\b" is not a usable word boundary:
# it would find PIC inside WS-PIC and END-IF inside WS-END-IF. These
# lookarounds treat letters, digits and "-" as word characters instead.
_WORD_START = r'(?<![A-Za-z0-9-])'
_WORD_END = r'(?![A-Za-z0-9-])'

# Explicit scope terminators. COBOL is case-insensitive, hence IGNORECASE.
_END_SCOPE_PATTERN = re.compile(
    _WORD_START
    + r'(END-IF|END-PERFORM|END-EVALUATE|END-READ|END-WRITE|'
    r'END-REWRITE|END-DELETE|END-START|END-RETURN|END-RECEIVE|'
    r'END-CALL|END-COMPUTE|END-ADD|END-SUBTRACT|END-MULTIPLY|'
    r'END-DIVIDE|END-STRING|END-UNSTRING|END-INSPECT|END-SEARCH)'
    + _WORD_END,
    re.IGNORECASE,
)
_PIC_START_PATTERN = re.compile(
    _WORD_START + r'(?:PICTURE|PIC)' + _WORD_END, re.IGNORECASE
)
# What follows the PIC keyword: optional "IS", then the picture string.
_PIC_STRING_PATTERN = re.compile(r'\s*(?:IS\s+)?(\S+)', re.IGNORECASE)
# A quoted literal; the closing quote is optional so that an unterminated
# literal extends to the end of the text.
_LITERAL_PATTERN = re.compile(r"'[^']*'?|\"[^\"]*\"?")


class CobolNormalizer:
    """Normalize fixed-format COBOL source into one statement per line."""

    def normalize(self, source: str) -> list[str]:
        """Run every normalization step on *source*.

        Args:
            source: Complete COBOL source text.

        Returns:
            The normalized statements, one per list element.
        """
        lines = source.splitlines()
        lines = self.remove_sequence_numbers(lines)
        lines = self.remove_comments(lines)
        lines = self.remove_identification_area(lines)
        lines = self.join_continuation_lines(lines)
        lines = self.normalize_spaces(lines)
        lines = self.split_to_one_statement(lines)
        return lines

    # -- Step 1: strip line labels ------------------------------------------
    def remove_sequence_numbers(self, lines: list[str]) -> list[str]:
        """Drop the first six columns of every line.

        Tabs are expanded to 8-column stops first so that slicing works on
        columns. Lines of six characters or fewer are discarded. The
        indicator column becomes index 0 of each returned line.
        """
        result = []
        for line in lines:
            line = line.rstrip('\n').rstrip('\r').expandtabs(8)
            if len(line) > 6:
                result.append(line[6:])
        return result

    # -- Step 2: delete comment lines ---------------------------------------
    def remove_comments(self, lines: list[str]) -> list[str]:
        """Drop empty lines and lines whose first character is '*' or '/'."""
        return [line for line in lines if line and line[0] not in ('*', '/')]

    # -- Step 3: strip the identification area ------------------------------
    def remove_identification_area(self, lines: list[str]) -> list[str]:
        """Keep the first 66 characters of each line, minus trailing blanks.

        With the six label columns already removed, 66 characters correspond
        to columns 7 through 72.
        """
        return [line[:66].rstrip() for line in lines]

    # -- Step 4: join continuation lines ------------------------------------
    def join_continuation_lines(self, lines: list[str]) -> list[str]:
        """Merge lines marked with '-' in the indicator column into the
        preceding line.

        Empty lines are dropped. A continuation with no preceding line is
        kept as a line of its own.
        """
        result = []
        for line in lines:
            if not line:
                continue
            if line[0] != '-':
                result.append(line)
                continue

            body = line[1:].lstrip()
            if not result:
                result.append(body)
                continue

            prev = result[-1]
            if body[:1] in _QUOTES:
                # Continued literal: the quote that reopens the literal on
                # the continuation line is not part of its value.
                quote = body[0]
                if prev.endswith(quote):
                    prev = prev[:-1]
                result[-1] = prev + body[1:]
            else:
                # Continued word or number: the continuation follows the
                # last non-blank character with no intervening space.
                result[-1] = prev.rstrip() + body
        return result

    # -- Step 5: normalize spaces -------------------------------------------
    def normalize_spaces(self, lines: list[str]) -> list[str]:
        """Squeeze spaces in every line and drop lines that end up empty."""
        result = []
        for line in lines:
            squeezed = self._squeeze_spaces(line)
            if squeezed:
                result.append(squeezed)
        return result

    def _squeeze_spaces(self, line: str) -> str:
        """Collapse runs of spaces outside quoted literals and trim the ends."""
        output = []
        quote_char = ''  # non-empty while inside a literal
        prev_space = False
        for ch in line:
            if quote_char:
                output.append(ch)
                if ch == quote_char:
                    quote_char = ''
                continue
            if ch == ' ':
                if not prev_space:
                    output.append(ch)
                prev_space = True
                continue
            if ch in _QUOTES:
                quote_char = ch
            output.append(ch)
            prev_space = False
        return ''.join(output).strip()

    # -- Step 6: split into one statement per line --------------------------
    def split_to_one_statement(self, lines: list[str]) -> list[str]:
        """Split every line into its individual statements."""
        result = []
        for line in lines:
            result.extend(self._tokenize_line(line))
        return result

    def _tokenize_line(self, line: str) -> list[str]:
        """Split *line* at statement-terminating periods.

        Periods inside quoted literals, between two digits (decimal points)
        and inside a PICTURE character-string do not terminate a statement.
        Text left over after the last period is further split after each
        explicit scope terminator (END-IF and the like).
        """
        statements = []
        current = []
        quote_char = ''  # non-empty while inside a literal
        i, n = 0, len(line)
        while i < n:
            ch = line[i]

            if quote_char:
                current.append(ch)
                if ch == quote_char:
                    quote_char = ''
                i += 1
                continue

            if ch in _QUOTES:
                quote_char = ch
                current.append(ch)
                i += 1
                continue

            if ch == '.':
                prev_ch = current[-1] if current else ''
                next_ch = line[i + 1] if i + 1 < n else ''
                current.append(ch)
                i += 1
                if prev_ch.isdigit() and next_ch.isdigit():
                    continue  # decimal point inside a numeric literal
                stmt = ''.join(current).strip()
                if stmt and stmt != '.':
                    statements.append(stmt)
                current = []
                continue

            if ch in 'Pp':
                m = _PIC_START_PATTERN.match(line, i)
                if m:
                    # Copy the keyword and the whole picture string verbatim
                    # so that a '.' inside it is never taken as a terminator.
                    end = self._picture_end(line, m.end())
                    current.extend(line[i:end])
                    i = end
                    continue

            current.append(ch)
            i += 1

        remaining = ''.join(current).strip()
        if remaining:
            statements.extend(self._split_by_end_scope(remaining))
        return [s for s in statements if s]

    def _picture_end(self, line: str, pos: int) -> int:
        """Return the index just past the picture string that starts at or
        after *pos* (the position right after the PIC keyword).

        An optional "IS" is skipped. A period that ends the picture token is
        left out: it is the separator that closes the entry, so the caller
        still sees it as a terminator.
        """
        m = _PIC_STRING_PATTERN.match(line, pos)
        if m is None:
            return pos
        end = m.end(1)
        if line[end - 1] == '.':
            end -= 1
        return end

    def _split_by_end_scope(self, text: str) -> list[str]:
        """Split *text* after each explicit scope terminator.

        Terminator-like text inside quoted literals is ignored. Returns
        ``[text]`` when nothing can be split off.
        """
        literal_spans = [lit.span() for lit in _LITERAL_PATTERN.finditer(text)]
        parts, last = [], 0
        for m in _END_SCOPE_PATTERN.finditer(text):
            if any(start <= m.start() < end for start, end in literal_spans):
                continue
            chunk = text[last:m.end()].strip()
            if chunk:
                parts.append(chunk)
            last = m.end()
        rem = text[last:].strip()
        if rem:
            parts.append(rem)
        return parts or [text]
動作確認デモ
# Sample input for the demo. Line 001400 continues the literal opened on
# line 001300: it carries '-' in the indicator column and reopens the
# literal with a quotation mark, which is what joins the two halves into
# 'HELLO WORLD' without an extra space.
sample_cobol = """\
000100 IDENTIFICATION DIVISION.
000200* =======================================
000300* サンプルプログラム(正規化テスト用)
000400* =======================================
000500 PROGRAM-ID. NORMTEST.
000600 DATA DIVISION.
000700 WORKING-STORAGE SECTION.
000800 01 WS-MSG PIC X(30).
000900 01 WS-AMT PIC 9(5)V99.
001000 01 WS-FLG PIC X(1).
001100 PROCEDURE DIVISION.
001200 MAIN-PARA.
001300 MOVE 'HELLO WOR
001400-    'LD' TO WS-MSG.
001500 MOVE 1.5 TO WS-AMT.
001600 DISPLAY WS-MSG. DISPLAY WS-AMT.
001700 IF WS-FLG = 'Y'
001800 DISPLAY 'FLAG ON'
001900 END-IF
002000 STOP RUN.
"""

# Normalize the sample and print one statement per output line.
normalizer = CobolNormalizer()
result = normalizer.normalize(sample_cobol)
for statement in result:
    print(statement)
実行結果(正規化後)
IDENTIFICATION DIVISION.
PROGRAM-ID.
NORMTEST.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 WS-MSG PIC X(30).
01 WS-AMT PIC 9(5)V99.
01 WS-FLG PIC X(1).
PROCEDURE DIVISION.
MAIN-PARA.
MOVE 'HELLO WORLD' TO WS-MSG.
MOVE 1.5 TO WS-AMT.
DISPLAY WS-MSG.
DISPLAY WS-AMT.
IF WS-FLG = 'Y'
DISPLAY 'FLAG ON'
END-IF
STOP RUN.
ファイル入出力
実際のソースファイルを読み込んで正規化し、出力ファイルに書き出す例を示す。
from pathlib import Path
def normalize_file(input_path: str, output_path: str, encoding: str = 'utf-8') -> None:
    """Normalize a COBOL source file and write the result to another file.

    Args:
        input_path: Path of the COBOL source file to read.
        output_path: Path of the file to write. It is always written as
            UTF-8, one normalized statement per line.
        encoding: Codec used to read the input, e.g. 'utf-8', or 'cp932'
            for Shift-JIS text. NOTE: Python's standard codecs do not
            include 'cp930'; convert such EBCDIC sources to a supported
            encoding before calling this function.

    Bytes that cannot be decoded are replaced rather than raising, because
    the file is read with errors='replace'.
    """
    source = Path(input_path).read_text(encoding=encoding, errors='replace')
    normalized_lines = CobolNormalizer().normalize(source)
    # Terminate each statement individually so that an empty result yields
    # an empty file instead of a file holding one blank line.
    text = ''.join(f"{line}\n" for line in normalized_lines)
    Path(output_path).write_text(text, encoding='utf-8')
    print(f"完了: {len(normalized_lines)} 命令 → {output_path}")
# Usage example
normalize_file(input_path='SAMPLE.CBL', output_path='SAMPLE_normalized.CBL')