目次
参考サイト
Colab用コード
法令取得用コード 法人税法22条の2
import requests
import xml.etree.ElementTree as ET
# Identifiers for the statute fetched from the e-Gov law API (v2).
LAW_ID = "340AC0000000034" # Corporation Tax Act (Act No. 34 of 1965)
LAW_NAME = "法人税法"
# Article 22-2 ("amount of revenue") is the single article extracted below.
TARGET_ARTICLE_TITLE = "第二十二条の二"
TARGET_ARTICLE_CAPTION = "(収益の額)"
API_URL = f"https://laws.e-gov.go.jp/api/2/law_data/{LAW_ID}"
def get_text(elem):
    """Concatenate every text node under *elem* and strip the result.

    Returns "" when *elem* is None, so callers can pass the result of a
    failed `find()` directly.
    """
    if elem is None:
        return ""
    joined = "".join(elem.itertext())
    return joined.strip()
def fetch_xml_root():
    """Download the law XML from the e-Gov API and return its <law_full_text> subtree.

    Raises requests.HTTPError on a non-2xx response. Returns None if the
    response contains no <law_full_text> element.
    """
    response = requests.get(API_URL, params={"response_format": "xml"}, timeout=30)
    response.raise_for_status()
    # Force UTF-8 before decoding; the API serves UTF-8 XML.
    response.encoding = "utf-8"
    document = ET.fromstring(response.text)
    return document.find(".//law_full_text")
def find_article_by_title(root, title):
    """Walk every <Article> under *root* and return the first one whose
    <ArticleTitle> text equals *title*; None when no article matches.
    """
    for node in root.iter("Article"):
        title_el = node.find("./ArticleTitle")
        # Same extraction get_text() performs, done inline.
        text = "" if title_el is None else "".join(title_el.itertext()).strip()
        if text == title:
            return node
    return None
def sentence_block_text(parent, block_tag):
    """Join the text of the direct <Sentence> children found under
    parent/<block_tag> (ParagraphSentence, ItemSentence, SubitemSentence, ...).

    Returns "" when the block element is absent.
    """
    container = parent.find(f"./{block_tag}")
    if container is None:
        return ""
    fragments = []
    for sentence in container.findall("./Sentence"):
        # Same extraction get_text() performs, done inline.
        content = "".join(sentence.itertext()).strip()
        if content:
            fragments.append(content)
    return "".join(fragments).strip()
def find_direct_subitems(elem):
    """Return the direct children of *elem* whose tag starts with
    'Subitem' (Subitem1, Subitem2, ...), preserving document order.
    """
    return [child for child in elem if child.tag.startswith("Subitem")]
def subitem_title_and_sentence_tag(tag_name):
    """Map a SubitemN tag name to its (SubitemNTitle, SubitemNSentence) pair."""
    return tag_name + "Title", tag_name + "Sentence"
def item_like_to_markdown(elem, indent=0):
    """Render an Item / Subitem1 / Subitem2 ... element as Markdown bullet
    lines, recursing into nested subitems with two extra spaces of indent.
    """
    bullet = " " * indent + "- "
    if elem.tag == "Item":
        heading = get_text(elem.find("./ItemTitle"))
        body = sentence_block_text(elem, "ItemSentence")
    else:
        heading_tag, body_tag = subitem_title_and_sentence_tag(elem.tag)
        heading = get_text(elem.find(f"./{heading_tag}"))
        body = sentence_block_text(elem, body_tag)
    rendered = []
    # Emit a bullet only when the element carries any text at all.
    if heading or body:
        rendered.append(f"{bullet}{heading} {body}".rstrip())
    for child in find_direct_subitems(elem):
        rendered.extend(item_like_to_markdown(child, indent + 1))
    return rendered
def paragraph_to_markdown(paragraph):
    """Convert one <Paragraph> (number, sentence, nested items) to Markdown,
    joining the pieces with blank lines and dropping empty fragments.
    """
    number = get_text(paragraph.find("./ParagraphNum"))
    sentence = sentence_block_text(paragraph, "ParagraphSentence")
    parts = [f"{number} {sentence}" if number else sentence]
    for item in paragraph.findall("./Item"):
        parts.extend(item_like_to_markdown(item, indent=0))
    return "\n\n".join(part for part in parts if part.strip())
def article_to_markdown(article):
    """Render an <Article> element as a Markdown document: law name as H1,
    article title (+ caption) as H2, then one section per paragraph.
    """
    # Fall back to the known caption when the XML omits <ArticleCaption>.
    caption = get_text(article.find("./ArticleCaption")) or TARGET_ARTICLE_CAPTION
    heading = get_text(article.find("./ArticleTitle"))
    pieces = [f"# {LAW_NAME}"]
    if heading:
        pieces.append(f"## {heading}{caption}" if caption else f"## {heading}")
    for paragraph in article.findall("./Paragraph"):
        rendered = paragraph_to_markdown(paragraph)
        if rendered:
            pieces.append(rendered)
    return "\n\n".join(pieces)
def main():
    """Fetch the statute XML, locate the target article, write it to a
    Markdown file, and echo the result to stdout.

    Raises ValueError when the article cannot be found.
    """
    law_root = fetch_xml_root()
    article = find_article_by_title(law_root, TARGET_ARTICLE_TITLE)
    if article is None:
        raise ValueError(f"{TARGET_ARTICLE_TITLE} が見つかりませんでした。")
    markdown_text = article_to_markdown(article)
    output_file = "houjinzei_22_2.md"
    with open(output_file, "w", encoding="utf-8") as fh:
        fh.write(markdown_text)
    print(markdown_text)
    print(f"\n保存先: {output_file}")


if __name__ == "__main__":
    main()
通達取得用コード 第2款 固定資産の譲渡等に係る収益
import requests
from bs4 import BeautifulSoup
import re
import os
# NTA basic-circular page (Corporation Tax Act, Subsection 2: revenue from
# transfers of fixed assets) to scrape, and the Markdown output path.
TARGET_URL = "https://www.nta.go.jp/law/tsutatsu/kihon/hojin/02/02_01_02.htm"
OUTPUT_FILENAME = "第2款_固定資産の譲渡等に係る収益.md"
def fetch_soup(url: str) -> BeautifulSoup:
    """GET *url* and parse the body into a BeautifulSoup tree.

    Raises requests.HTTPError on a non-2xx response.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Let requests sniff the charset; NTA pages do not always declare it.
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, "html.parser")
def clean_text(text: str) -> str:
    """Normalize scraped text: ideographic spaces to ASCII spaces, hyphen
    look-alikes to "-", and runs of whitespace collapsed to single spaces.
    """
    text = text.replace("\u3000", " ")
    # Normalize minus sign (U+2212) and fullwidth hyphen-minus (U+FF0D) to
    # ASCII "-". The original second call was .replace("-", "-") — an ASCII
    # no-op, almost certainly a mojibake of the fullwidth form.
    text = text.replace("\u2212", "-").replace("\uff0d", "-")
    text = re.sub(r"\s+", " ", text)
    return text.strip()
def extract_main_content(soup: BeautifulSoup):
    """Return the most specific content container present on the page,
    trying known NTA layout ids first and falling back to <main>, <body>,
    and finally the whole soup.
    """
    for node in (
        soup.find("div", id="main_content"),
        soup.find("div", id="contents"),
        soup.find("main"),
        soup.body,
    ):
        if node is not None:
            return node
    return soup
def is_breadcrumb_or_noise(text: str) -> bool:
    """True when *text* is breadcrumb/navigation boilerplate rather than content."""
    return text in {
        "ホーム",
        "法令等",
        "法令解釈通達",
        "このページの先頭へ",
        "前のページへ",
        "次のページへ",
    }
def is_parenthesized_heading(text: str) -> bool:
return text.startswith("(") and text.endswith(")")
def normalize_heading(text: str) -> str:
return text.strip().strip("()").strip()
def should_skip_container(el) -> bool:
    """True when the element's class list marks it as navigation chrome
    (breadcrumbs, page-top links, nav bars) that should not be scraped.
    """
    joined = " ".join(el.get("class", [])).lower()
    return any(
        token in joined
        for token in ("breadcrumb", "topicpath", "pankuzu", "nav", "pagetop")
    )
def normalize_number_string(text: str) -> str:
    """Normalize a circular number like '2 - 1 - 18': unify hyphen
    look-alikes to ASCII "-" and remove every whitespace character.
    """
    # Normalize minus sign (U+2212) and fullwidth hyphen-minus (U+FF0D);
    # the original second call was .replace("-", "-"), an ASCII no-op that
    # was almost certainly a mojibake of the fullwidth form.
    text = text.replace("\u2212", "-").replace("\uff0d", "-")
    return re.sub(r"\s+", "", text)
def looks_like_number_only(text: str) -> bool:
    """True when *text*, once normalized, is exactly a circular number 'N-N-N'."""
    return re.fullmatch(r"\d+-\d+-\d+", normalize_number_string(text)) is not None
def split_number_and_rest(text: str):
    """Split a leading circular number from its body text.

    Examples:
        '2-1-18 法人が...'  -> ('2-1-18', '法人が...')
        '2 -1-18 法人が...' -> ('2-1-18', '法人が...')
        '2-1-18法人が...'   -> ('2-1-18', '法人が...')

    Returns (None, text) unchanged when no leading N-N-N number is found.
    """
    # Normalize minus sign (U+2212) and fullwidth hyphen-minus (U+FF0D);
    # the original second call was .replace("-", "-"), an ASCII no-op that
    # was almost certainly a mojibake of the fullwidth form.
    normalized = text.replace("\u2212", "-").replace("\uff0d", "-").strip()
    m = re.match(r"^\s*(\d+\s*-\s*\d+\s*-\s*\d+)\s*(.*)$", normalized)
    if not m:
        return None, text
    # Hyphens are already ASCII here, so normalization reduces to
    # stripping the whitespace allowed around them.
    number = re.sub(r"\s+", "", m.group(1))
    return number, m.group(2).strip()
def scrape_to_markdown(url: str, output_filename: str) -> str:
    """Scrape the NTA circular page at *url* into a Markdown file.

    Headings wrapped in fullwidth parentheses are buffered as a "pending"
    caption and attached to the next circular number encountered; other
    elements map to Markdown headings or paragraphs. Writes the result to
    *output_filename* and returns its absolute path.
    """
    soup = fetch_soup(url)
    main_content = extract_main_content(soup)
    title_tag = soup.find("h1")
    title = clean_text(title_tag.get_text()) if title_tag else "法人税法基本通達"
    markdown_lines = [f"# {title}", ""]
    pending_heading = None  # parenthesized caption awaiting its circular number
    elements = main_content.find_all(["h2", "h3", "h4", "p", "dt", "dd", "li"])
    for el in elements:
        if should_skip_container(el):
            continue
        text = clean_text(el.get_text(" ", strip=True))
        if not text or is_breadcrumb_or_noise(text):
            continue
        if text == title:
            continue  # skip the page title when repeated inside the body
        # Among h2/h3/h4, parenthesized text is held back as a circular heading
        if el.name == "h2":
            if is_parenthesized_heading(text):
                pending_heading = normalize_heading(text)
            else:
                markdown_lines.append(f"## {text}")
                markdown_lines.append("")
                pending_heading = None
            continue
        if el.name == "h3":
            if is_parenthesized_heading(text):
                pending_heading = normalize_heading(text)
            else:
                markdown_lines.append(f"### {text}")
                markdown_lines.append("")
                pending_heading = None
            continue
        if el.name == "h4":
            if is_parenthesized_heading(text):
                pending_heading = normalize_heading(text)
            else:
                markdown_lines.append(f"#### {text}")
                markdown_lines.append("")
                pending_heading = None
            continue
        if is_parenthesized_heading(text):
            pending_heading = normalize_heading(text)
            continue
        # Line consisting of a circular number only
        if looks_like_number_only(text):
            number = normalize_number_string(text)
            if pending_heading:
                markdown_lines.append(f"#### {number} {pending_heading}")
                markdown_lines.append("")
                pending_heading = None
            else:
                markdown_lines.append(f"#### {number}")
                markdown_lines.append("")
            continue
        # Line starting with a circular number followed by body text
        number, rest = split_number_and_rest(text)
        if number:
            if pending_heading:
                markdown_lines.append(f"#### {number} {pending_heading}")
                markdown_lines.append("")
                if rest:
                    markdown_lines.append(rest)
                    markdown_lines.append("")
                pending_heading = None
            else:
                markdown_lines.append(f"#### {number}")
                markdown_lines.append("")
                if rest:
                    markdown_lines.append(rest)
                    markdown_lines.append("")
            continue
        # Annotation lines ("(注)" notes, fullwidth or ASCII parentheses)
        if text.startswith("(注)") or text.startswith("(注)"):
            markdown_lines.append(text)
            markdown_lines.append("")
            continue
        # Enumerated sub-clauses such as "(1) ..." or "1 ..."
        if re.match(r"^\(\d+\)", text) or re.match(r"^\d+\s", text):
            markdown_lines.append(text)
            markdown_lines.append("")
            continue
        # Plain body text
        markdown_lines.append(text)
        markdown_lines.append("")
    # Collapse runs of consecutive blank lines
    cleaned_lines = []
    prev_blank = False
    for line in markdown_lines:
        blank = line.strip() == ""
        if blank and prev_blank:
            continue
        cleaned_lines.append(line)
        prev_blank = blank
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(cleaned_lines).strip() + "\n")
    return os.path.abspath(output_filename)
# Run the scrape and preview the first part of the generated Markdown.
saved_path = scrape_to_markdown(TARGET_URL, OUTPUT_FILENAME)
print("保存完了:", saved_path)
with open(saved_path, "r", encoding="utf-8") as preview_file:
    preview = preview_file.read()[:1500]
print("-" * 40)
print(preview)