目次
参考サイト
Colab用コード
法令取得用コード 法人税法22条の2
import requests
import xml.etree.ElementTree as ET
# Identifiers for the statute fetched from the e-Gov law API (v2).
LAW_ID = "340AC0000000034" # Corporation Tax Act (Act No. 34 of 1965)
LAW_NAME = "法人税法"
# Article 22-2 ("amount of revenue") is the single article extracted below.
TARGET_ARTICLE_TITLE = "第二十二条の二"
TARGET_ARTICLE_CAPTION = "(収益の額)"
API_URL = f"https://laws.e-gov.go.jp/api/2/law_data/{LAW_ID}"
def get_text(elem):
    """Concatenate every text node under *elem* and strip the result.

    Returns "" when *elem* is None, so callers can pass the result of a
    failed `find()` directly.
    """
    if elem is None:
        return ""
    joined = "".join(elem.itertext())
    return joined.strip()
def fetch_xml_root():
    """Download the law XML from the e-Gov API and return its <law_full_text> subtree.

    Raises requests.HTTPError on a non-2xx response. Returns None if the
    response contains no <law_full_text> element.
    """
    response = requests.get(API_URL, params={"response_format": "xml"}, timeout=30)
    response.raise_for_status()
    # Force UTF-8 before decoding; the API serves UTF-8 XML.
    response.encoding = "utf-8"
    document = ET.fromstring(response.text)
    return document.find(".//law_full_text")
def find_article_by_title(root, title):
    """Walk every <Article> under *root* and return the first one whose
    <ArticleTitle> text equals *title*; None when no article matches.
    """
    for node in root.iter("Article"):
        title_el = node.find("./ArticleTitle")
        # Same extraction get_text() performs, done inline.
        text = "" if title_el is None else "".join(title_el.itertext()).strip()
        if text == title:
            return node
    return None
def sentence_block_text(parent, block_tag):
    """Join the text of the direct <Sentence> children found under
    parent/<block_tag> (ParagraphSentence, ItemSentence, SubitemSentence, ...).

    Returns "" when the block element is absent.
    """
    container = parent.find(f"./{block_tag}")
    if container is None:
        return ""
    fragments = []
    for sentence in container.findall("./Sentence"):
        # Same extraction get_text() performs, done inline.
        content = "".join(sentence.itertext()).strip()
        if content:
            fragments.append(content)
    return "".join(fragments).strip()
def find_direct_subitems(elem):
    """Return the direct children of *elem* whose tag starts with
    'Subitem' (Subitem1, Subitem2, ...), preserving document order.
    """
    return [child for child in elem if child.tag.startswith("Subitem")]
def subitem_title_and_sentence_tag(tag_name):
    """Map a SubitemN tag name to its (SubitemNTitle, SubitemNSentence) pair."""
    return tag_name + "Title", tag_name + "Sentence"
def item_like_to_markdown(elem, indent=0):
    """Render an Item / Subitem1 / Subitem2 ... element as Markdown bullet
    lines, recursing into nested subitems with two extra spaces of indent.
    """
    bullet = " " * indent + "- "
    if elem.tag == "Item":
        heading = get_text(elem.find("./ItemTitle"))
        body = sentence_block_text(elem, "ItemSentence")
    else:
        heading_tag, body_tag = subitem_title_and_sentence_tag(elem.tag)
        heading = get_text(elem.find(f"./{heading_tag}"))
        body = sentence_block_text(elem, body_tag)
    rendered = []
    # Emit a bullet only when the element carries any text at all.
    if heading or body:
        rendered.append(f"{bullet}{heading} {body}".rstrip())
    for child in find_direct_subitems(elem):
        rendered.extend(item_like_to_markdown(child, indent + 1))
    return rendered
def paragraph_to_markdown(paragraph):
    """Convert one <Paragraph> (number, sentence, nested items) to Markdown,
    joining the pieces with blank lines and dropping empty fragments.
    """
    number = get_text(paragraph.find("./ParagraphNum"))
    sentence = sentence_block_text(paragraph, "ParagraphSentence")
    parts = [f"{number} {sentence}" if number else sentence]
    for item in paragraph.findall("./Item"):
        parts.extend(item_like_to_markdown(item, indent=0))
    return "\n\n".join(part for part in parts if part.strip())
def article_to_markdown(article):
    """Render an <Article> element as a Markdown document: law name as H1,
    article title (+ caption) as H2, then one section per paragraph.
    """
    # Fall back to the known caption when the XML omits <ArticleCaption>.
    caption = get_text(article.find("./ArticleCaption")) or TARGET_ARTICLE_CAPTION
    heading = get_text(article.find("./ArticleTitle"))
    pieces = [f"# {LAW_NAME}"]
    if heading:
        pieces.append(f"## {heading}{caption}" if caption else f"## {heading}")
    for paragraph in article.findall("./Paragraph"):
        rendered = paragraph_to_markdown(paragraph)
        if rendered:
            pieces.append(rendered)
    return "\n\n".join(pieces)
def main():
    """Fetch the statute XML, locate the target article, write it to a
    Markdown file, and echo the result to stdout.

    Raises ValueError when the article cannot be found.
    """
    law_root = fetch_xml_root()
    article = find_article_by_title(law_root, TARGET_ARTICLE_TITLE)
    if article is None:
        raise ValueError(f"{TARGET_ARTICLE_TITLE} が見つかりませんでした。")
    markdown_text = article_to_markdown(article)
    output_file = "houjinzei_22_2.md"
    with open(output_file, "w", encoding="utf-8") as fh:
        fh.write(markdown_text)
    print(markdown_text)
    print(f"\n保存先: {output_file}")


if __name__ == "__main__":
    main()
通達取得用コード 第2款 固定資産の譲渡等に係る収益
import requests
from bs4 import BeautifulSoup
import re
import os
# NTA basic-circular page (Corporation Tax Act, Subsection 2: revenue from
# transfers of fixed assets) to scrape, and the Markdown output path.
TARGET_URL = "https://www.nta.go.jp/law/tsutatsu/kihon/hojin/02/02_01_02.htm"
OUTPUT_FILENAME = "第2款_固定資産の譲渡等に係る収益.md"
def fetch_soup(url: str) -> BeautifulSoup:
    """GET *url* and parse the body into a BeautifulSoup tree.

    Raises requests.HTTPError on a non-2xx response.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Let requests sniff the charset; NTA pages do not always declare it.
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, "html.parser")
def clean_text(text: str) -> str:
    """Normalize scraped text: ideographic spaces to ASCII spaces, hyphen
    look-alikes to "-", and runs of whitespace collapsed to single spaces.
    """
    text = text.replace("\u3000", " ")
    # Normalize minus sign (U+2212) and fullwidth hyphen-minus (U+FF0D) to
    # ASCII "-". The original second call was .replace("-", "-") — an ASCII
    # no-op, almost certainly a mojibake of the fullwidth form.
    text = text.replace("\u2212", "-").replace("\uff0d", "-")
    text = re.sub(r"\s+", " ", text)
    return text.strip()
def extract_main_content(soup: BeautifulSoup):
    """Return the most specific content container present on the page,
    trying known NTA layout ids first and falling back to <main>, <body>,
    and finally the whole soup.
    """
    for node in (
        soup.find("div", id="main_content"),
        soup.find("div", id="contents"),
        soup.find("main"),
        soup.body,
    ):
        if node is not None:
            return node
    return soup
def is_breadcrumb_or_noise(text: str) -> bool:
    """True when *text* is breadcrumb/navigation boilerplate rather than content."""
    return text in {
        "ホーム",
        "法令等",
        "法令解釈通達",
        "このページの先頭へ",
        "前のページへ",
        "次のページへ",
    }
def is_parenthesized_heading(text: str) -> bool:
return text.startswith("(") and text.endswith(")")
def normalize_heading(text: str) -> str:
return text.strip().strip("()").strip()
def should_skip_container(el) -> bool:
    """True when the element's class list marks it as navigation chrome
    (breadcrumbs, page-top links, nav bars) that should not be scraped.
    """
    joined = " ".join(el.get("class", [])).lower()
    return any(
        token in joined
        for token in ("breadcrumb", "topicpath", "pankuzu", "nav", "pagetop")
    )
def normalize_number_string(text: str) -> str:
    """Normalize a circular number like '2 - 1 - 18': unify hyphen
    look-alikes to ASCII "-" and remove every whitespace character.
    """
    # Normalize minus sign (U+2212) and fullwidth hyphen-minus (U+FF0D);
    # the original second call was .replace("-", "-"), an ASCII no-op that
    # was almost certainly a mojibake of the fullwidth form.
    text = text.replace("\u2212", "-").replace("\uff0d", "-")
    return re.sub(r"\s+", "", text)
def looks_like_number_only(text: str) -> bool:
    """True when *text*, once normalized, is exactly a circular number 'N-N-N'."""
    return re.fullmatch(r"\d+-\d+-\d+", normalize_number_string(text)) is not None
def split_number_and_rest(text: str):
    """Split a leading circular number from its body text.

    Examples:
        '2-1-18 法人が...'  -> ('2-1-18', '法人が...')
        '2 -1-18 法人が...' -> ('2-1-18', '法人が...')
        '2-1-18法人が...'   -> ('2-1-18', '法人が...')

    Returns (None, text) unchanged when no leading N-N-N number is found.
    """
    # Normalize minus sign (U+2212) and fullwidth hyphen-minus (U+FF0D);
    # the original second call was .replace("-", "-"), an ASCII no-op that
    # was almost certainly a mojibake of the fullwidth form.
    normalized = text.replace("\u2212", "-").replace("\uff0d", "-").strip()
    m = re.match(r"^\s*(\d+\s*-\s*\d+\s*-\s*\d+)\s*(.*)$", normalized)
    if not m:
        return None, text
    # Hyphens are already ASCII here, so normalization reduces to
    # stripping the whitespace allowed around them.
    number = re.sub(r"\s+", "", m.group(1))
    return number, m.group(2).strip()
def scrape_to_markdown(url: str, output_filename: str) -> str:
    """Scrape the NTA circular page at *url* into a Markdown file.

    Headings wrapped in fullwidth parentheses are buffered as a "pending"
    caption and attached to the next circular number encountered; other
    elements map to Markdown headings or paragraphs. Writes the result to
    *output_filename* and returns its absolute path.
    """
    soup = fetch_soup(url)
    main_content = extract_main_content(soup)
    title_tag = soup.find("h1")
    title = clean_text(title_tag.get_text()) if title_tag else "法人税法基本通達"
    markdown_lines = [f"# {title}", ""]
    pending_heading = None  # parenthesized caption awaiting its circular number
    elements = main_content.find_all(["h2", "h3", "h4", "p", "dt", "dd", "li"])
    for el in elements:
        if should_skip_container(el):
            continue
        text = clean_text(el.get_text(" ", strip=True))
        if not text or is_breadcrumb_or_noise(text):
            continue
        if text == title:
            continue  # skip the page title when repeated inside the body
        # Among h2/h3/h4, parenthesized text is held back as a circular heading
        if el.name == "h2":
            if is_parenthesized_heading(text):
                pending_heading = normalize_heading(text)
            else:
                markdown_lines.append(f"## {text}")
                markdown_lines.append("")
                pending_heading = None
            continue
        if el.name == "h3":
            if is_parenthesized_heading(text):
                pending_heading = normalize_heading(text)
            else:
                markdown_lines.append(f"### {text}")
                markdown_lines.append("")
                pending_heading = None
            continue
        if el.name == "h4":
            if is_parenthesized_heading(text):
                pending_heading = normalize_heading(text)
            else:
                markdown_lines.append(f"#### {text}")
                markdown_lines.append("")
                pending_heading = None
            continue
        if is_parenthesized_heading(text):
            pending_heading = normalize_heading(text)
            continue
        # Line consisting of a circular number only
        if looks_like_number_only(text):
            number = normalize_number_string(text)
            if pending_heading:
                markdown_lines.append(f"#### {number} {pending_heading}")
                markdown_lines.append("")
                pending_heading = None
            else:
                markdown_lines.append(f"#### {number}")
                markdown_lines.append("")
            continue
        # Line starting with a circular number followed by body text
        number, rest = split_number_and_rest(text)
        if number:
            if pending_heading:
                markdown_lines.append(f"#### {number} {pending_heading}")
                markdown_lines.append("")
                if rest:
                    markdown_lines.append(rest)
                    markdown_lines.append("")
                pending_heading = None
            else:
                markdown_lines.append(f"#### {number}")
                markdown_lines.append("")
                if rest:
                    markdown_lines.append(rest)
                    markdown_lines.append("")
            continue
        # Annotation lines ("(注)" notes, fullwidth or ASCII parentheses)
        if text.startswith("(注)") or text.startswith("(注)"):
            markdown_lines.append(text)
            markdown_lines.append("")
            continue
        # Enumerated sub-clauses such as "(1) ..." or "1 ..."
        if re.match(r"^\(\d+\)", text) or re.match(r"^\d+\s", text):
            markdown_lines.append(text)
            markdown_lines.append("")
            continue
        # Plain body text
        markdown_lines.append(text)
        markdown_lines.append("")
    # Collapse runs of consecutive blank lines
    cleaned_lines = []
    prev_blank = False
    for line in markdown_lines:
        blank = line.strip() == ""
        if blank and prev_blank:
            continue
        cleaned_lines.append(line)
        prev_blank = blank
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(cleaned_lines).strip() + "\n")
    return os.path.abspath(output_filename)
# Run the scrape and preview the first part of the generated Markdown.
saved_path = scrape_to_markdown(TARGET_URL, OUTPUT_FILENAME)
print("保存完了:", saved_path)
with open(saved_path, "r", encoding="utf-8") as preview_file:
    preview = preview_file.read()[:1500]
print("-" * 40)
print(preview)