Key phrase extraction prototype

This prototype runs key phrase extraction on 10 garden items and compares the two approaches from the write-up: LLM-based extraction via Ollama and statistical extraction via YAKE.

Model and settings

Model: gemma3:4b (Google Gemma 3, 4B parameter variant) via Ollama
Temperature: 0.1 (near-deterministic, minimizing randomness)
Max input: body text truncated to 1,500 words
Max output: 8 phrases per item

Prompt used

Extract 8 key phrases from this text.
Key phrases are specific concepts or topics (2-4 words each) that capture what the text is about.
Return ONLY the phrases, one per line. No numbering, no explanation.

Title: {title}

{body}

The prompt is deliberately simple: no few-shot examples, no role assignment. This is the baseline. Future iterations could test more structured prompts or include examples of good/bad phrases.

YAKE settings

Language: en
Max phrase length: 3 words (n=3)
Deduplication threshold: 0.7
No model needed, purely statistical

Prerequisites

Ollama running locally (ollama serve)
A generative model pulled: ollama pull gemma3:4b
Python with requests: pip install requests
For YAKE comparison: pip install yake

The code

"""
Key phrase extraction prototype: LLM vs YAKE on 10 garden items.
Run from the digital-garden project root:
    python key-phrase-extraction.py
"""

import json
import os
import re
import requests

# Root folder of the garden content. The path is relative, so the script has
# to be started from the project root.
CONTENT_DIR = "src/content"
# Ollama endpoint that the extraction prompt is POSTed to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Model tag sent as the "model" field of every generation request.
MODEL = "gemma3:4b"

# --- Helpers ---

def load_markdown(path):
    """Read a markdown file and return ``(title, body)``.

    A leading YAML frontmatter block (opened by ``---`` at the very start of
    the file and closed by a line consisting only of ``---``) is removed from
    the body, and its ``title:`` value, if present, becomes the title.
    Wiki-links are flattened to plain text: ``[[target|display]]`` becomes
    ``display`` and ``[[target]]`` becomes ``target``.

    Args:
        path: Path of the markdown file; it is read as UTF-8.

    Returns:
        A ``(title, body)`` tuple of strings. ``title`` is ``""`` when the
        file has no frontmatter or no non-empty ``title:`` key.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    # Strip YAML frontmatter
    title = ""
    if text.startswith("---"):
        # The closing delimiter must sit on a line of its own; a plain
        # substring search would stop at a "---" inside a frontmatter value.
        # Searching with a start position (rather than slicing) keeps "^"
        # tied to real line starts.
        closing = re.compile(r'^---[ \t]*\r?$', re.MULTILINE).search(text, 3)
        if closing:
            front = text[3:closing.start()]
            # Anchored to the line start so keys such as "subtitle:" do not
            # match, and limited to one line so an empty "title:" does not
            # swallow the following key.
            title_match = re.search(r'^title:[ \t]*(.*)$', front, re.MULTILINE)
            if title_match:
                title = title_match.group(1).strip()
                # Unwrap a value that is fully enclosed in matching quotes.
                if len(title) >= 2 and title[0] == title[-1] and title[0] in "\"'":
                    title = title[1:-1].strip()
            text = text[closing.end():].strip()

    # Strip wiki-links: [[target|display]] -> display, [[target]] -> target
    text = re.sub(r'\[\[([^]|]+)\|([^]]+)\]\]', r'\2', text)
    text = re.sub(r'\[\[([^]]+)\]\]', r'\1', text)

    return title, text


def pick_items(n=10):
    """Pick up to ``n`` items spread evenly across all collections.

    Every ``.md`` file in the known collection folders under ``CONTENT_DIR``
    is loaded with ``load_markdown``; items whose body is 50 characters or
    shorter are skipped. Folders that do not exist are ignored. Candidates
    are ordered by collection, then by file name, and the selection takes
    ``n`` evenly spaced entries from that ordered list so that late
    collections are represented as well as early ones.

    Args:
        n: Maximum number of items to return. Values of zero or less yield
            an empty selection.

    Returns:
        A list of dicts with the keys ``path``, ``collection``, ``title``
        and ``body``. All candidates are returned when there are at most
        ``n`` of them.
    """
    items = []
    for collection in ["field-notes", "articles", "seeds", "experiments", "weblinks"]:
        folder = os.path.join(CONTENT_DIR, collection)
        if not os.path.isdir(folder):
            continue
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".md"):
                continue
            path = os.path.join(folder, fname)
            title, body = load_markdown(path)
            if len(body) > 50:  # skip very short items
                items.append({
                    "path": path,
                    "collection": collection,
                    "title": title,
                    "body": body,
                })

    total = len(items)
    if n <= 0:
        selected = []
    elif total <= n:
        selected = list(items)
    else:
        # Evenly spaced indices over the whole list. A fixed stride of
        # total // n followed by [:n] degenerates to "the first n items"
        # whenever total < 2 * n and never reaches the tail of the list.
        selected = [items[i * total // n] for i in range(n)]
    print(f"Selected {len(selected)} items from {len(items)} total\n")
    return selected


# --- LLM extraction ---

def extract_keyphrases_llm(title, body, max_phrases=8, timeout=300):
    """Use Ollama to extract key phrases from a garden item.

    The body is truncated to its first 1,500 whitespace-separated words,
    embedded in the extraction prompt together with the title, and sent to
    ``OLLAMA_URL`` as a non-streaming request for ``MODEL`` at temperature
    0.1. The reply is parsed as one phrase per line.

    Args:
        title: Item title, included in the prompt.
        body: Item text.
        max_phrases: Number of phrases requested from the model and the
            maximum number returned.
        timeout: Seconds to wait for the HTTP request before giving up. The
            request is non-streaming, so the value has to cover the whole
            generation time.

    Returns:
        A list of at most ``max_phrases`` unique, lower-cased phrases in the
        order the model produced them.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Truncate very long texts to ~1500 words
    words = body.split()
    if len(words) > 1500:
        body = " ".join(words[:1500])

    prompt = f"""Extract {max_phrases} key phrases from this text.
Key phrases are specific concepts or topics (2-4 words each) that capture what the text is about.
Return ONLY the phrases, one per line. No numbering, no explanation.

Title: {title}

{body}"""

    # Without a timeout a stalled server would block the run indefinitely.
    response = requests.post(OLLAMA_URL, json={
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1}
    }, timeout=timeout)
    response.raise_for_status()
    result = response.json()["response"].strip()

    # Parse: one phrase per line, clean up
    phrases = []
    seen = set()
    for line in result.split("\n"):
        # Models sometimes number the list despite the prompt ("1. phrase",
        # "2) phrase"); drop that prefix before removing bullet characters.
        line = re.sub(r'^\d+[.)]\s+', '', line.strip())
        line = line.strip("-•*").strip()
        if not line or len(line) >= 80:
            continue  # blank lines and long lines (explanations, not phrases)
        phrase = line.lower()
        if phrase not in seen:  # a repeated phrase adds no information
            seen.add(phrase)
            phrases.append(phrase)
    return phrases[:max_phrases]


# --- YAKE extraction ---

def extract_keyphrases_yake(body, max_phrases=8):
    """Extract key phrases from ``body`` statistically with YAKE.

    YAKE is imported lazily so the rest of the script works without it; when
    the package is missing, a single placeholder entry is returned instead
    of raising.

    Args:
        body: Text to analyse.
        max_phrases: Number of top-ranked phrases requested from YAKE.

    Returns:
        A list of lower-cased phrases, or ``["(yake not installed)"]`` when
        YAKE cannot be imported.
    """
    try:
        import yake
    except ImportError:
        return ["(yake not installed)"]

    settings = {
        "lan": "en",
        "n": 3,           # phrases of at most three words
        "dedupLim": 0.7,  # deduplication threshold
        "top": max_phrases,
    }
    extractor = yake.KeywordExtractor(**settings)

    phrases = []
    for phrase, _score in extractor.extract_keywords(body):
        phrases.append(phrase.lower())
    return phrases


# --- Main ---

def main():
    """Run both extractors on the selected items, then report and save results.

    For each item picked by ``pick_items(10)`` the LLM and YAKE phrases are
    printed together with their exact-match overlap. All per-item results
    are written to ``keyphrase-results.json`` in the current directory, and
    the average phrase and overlap counts are printed at the end. When no
    items are found, a message is printed and nothing is written.
    """
    items = pick_items(10)
    if not items:
        # Nothing to process: avoid overwriting earlier results with an
        # empty file and dividing by zero in the summary below.
        print("No items found; run this script from the project root.")
        return

    results = []
    for item in items:
        word_count = len(item["body"].split())
        print(f"Processing: {item['title']}")
        print(f"  Collection: {item['collection']}")
        print(f"  Words: {word_count}")

        llm_phrases = extract_keyphrases_llm(item["title"], item["body"])
        yake_phrases = extract_keyphrases_yake(item["body"])

        print(f"  LLM phrases:  {llm_phrases}")
        print(f"  YAKE phrases: {yake_phrases}")

        # Find overlap (exact matches between the lower-cased phrases)
        overlap = set(llm_phrases) & set(yake_phrases)
        if overlap:
            print(f"  Overlap: {overlap}")
        print()

        results.append({
            "title": item["title"],
            "collection": item["collection"],
            "word_count": word_count,
            "llm_phrases": llm_phrases,
            "yake_phrases": yake_phrases,
            # Sorted so the saved file does not depend on set iteration order.
            "overlap": sorted(overlap),
        })

    # Save results
    with open("keyphrase-results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print("Results saved to keyphrase-results.json")

    # Summary stats
    count = len(results)
    avg_llm = sum(len(r["llm_phrases"]) for r in results) / count
    avg_yake = sum(len(r["yake_phrases"]) for r in results) / count
    avg_overlap = sum(len(r["overlap"]) for r in results) / count
    print("\nSummary:")
    print(f"  Avg LLM phrases:  {avg_llm:.1f}")
    print(f"  Avg YAKE phrases: {avg_yake:.1f}")
    print(f"  Avg overlap:      {avg_overlap:.1f}")


# Run the comparison only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

How to run

Make sure Ollama is running: ollama serve
Pull a generative model if needed: ollama pull gemma3:4b
Install dependencies: pip install requests yake
Run from the project root: python key-phrase-extraction.py

The script picks 10 items spread across collections, runs both extractors, and saves results to keyphrase-results.json.

What to look for

Do LLM phrases capture the actual topic better than YAKE?
How does quality differ for short items (under 100 words) vs longer ones?
Are there useful phrases that both methods agree on?
How long does LLM extraction take per item? (The script does not record timings, so measure this separately.)

Analysis of results goes in a linked field note.