Source code for app.backend.claim_splitter
"""Atomic claim decomposition for /api/analyse.
Primary path: prompt the HF generation model to break its response into
atomic, self-contained factual claims. This mirrors the decomposition
step of fact-checking pipelines such as FActScore (Min et al. 2023).
Fallback: NLTK sentence segmentation when the LLM path fails (missing
token, API error, or unparseable output). Sentences are a coarser unit
but keep the endpoint functional instead of returning nothing.
"""
import json

import nltk
from huggingface_hub import InferenceClient
from nltk.tokenize import sent_tokenize

from app.backend.constants import HF_FALLBACK_MODEL as MODEL, HF_TOKEN
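
# HF_TOKEN may be unset (e.g. in local development without credentials); in
# that case _llm_decompose short-circuits and every request is handled by
# the NLTK sentence fallback instead.
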
_SYSTEM_PROMPT = """\
You decompose text responses into atomic, self-contained factual claims,
returned as a JSON array of strings.
Rules:
- Each claim expresses a single factual assertion.
- Each claim is self-contained: resolve pronouns and include the subject
and any context needed to understand the claim standalone.
- Use only information present in the input; do not add new facts.
- Split compound sentences (e.g., dose + frequency + duration become
separate claims).
- Preserve numbers, units, and proper nouns exactly.
- Skip greetings, transitions, and pure filler.
- If the input contains no verifiable claims, return [].
Output: a JSON array of strings. No prose, no code fences.
"""
_USER_PROMPT = 'Decompose this response:\n\n"""\n{text}\n"""'
_MAX_DECOMPOSE_TOKENS = 600
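# A sketch of the decomposition the system prompt above asks for
# (hypothetical input/output; the actual split depends on the model):
#   input:  "Take 500 mg of amoxicillin three times daily for 7 days."
#   output: ["The amoxicillin dose is 500 mg.",
#            "Amoxicillin is taken three times daily.",
#            "Amoxicillin is taken for 7 days."]
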
def _parse_json_array(raw: str) -> list[str] | None:
    """Parse a JSON array of non-empty strings from the LLM's output.

    `json.loads` alone fails when the model wraps the array in prose or
    a markdown fence, so the outermost `[...]` is extracted before
    parsing. Returns None if no array is present or the result is not a
    list of strings; the caller treats None as failure and falls back.
    """
start = raw.find("[")
end = raw.rfind("]")
if start == -1 or end == -1 or end <= start:
return None
try:
parsed = json.loads(raw[start : end + 1])
except json.JSONDecodeError:
return None
if not isinstance(parsed, list):
return None
return [s.strip() for s in parsed if isinstance(s, str) and s.strip()]
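
# For instance (deterministic, no model call involved):
#   _parse_json_array('Sure! ["Paris is the capital of France."]')
#     -> ['Paris is the capital of France.']
#   _parse_json_array('No verifiable claims here.')  -> None
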
def _llm_decompose(text: str) -> list[str] | None:
    """Ask the HF model for an atomic-claim decomposition.

    Returns None on any failure (missing token, API error, unparseable
    output) so the caller can fall back to sentence segmentation.
    """
if not HF_TOKEN:
return None
try:
client = InferenceClient(MODEL, token=HF_TOKEN)
response = client.chat_completion(
[
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": _USER_PROMPT.format(text=text)},
],
max_tokens=_MAX_DECOMPOSE_TOKENS,
temperature=0.0,
)
raw = response.choices[0].message.content or ""
except Exception:
return None
claims = _parse_json_array(raw)
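    # An empty list is a legitimate "no claims" answer per the prompt, but it
    # is treated like a failure here so the caller still returns something
    # via the sentence fallback.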
    return claims if claims else None

def _nltk_sentences(text: str) -> list[str]:
    """Segment text into sentences; download punkt_tab on first use.

    The tokenizer data is a one-time ~10 MB download, cached by NLTK in
    the user's home directory. If the download itself fails (e.g. no
    network), we give up splitting and return the whole text as one unit.
    """
try:
sentences = sent_tokenize(text)
except LookupError:
if not nltk.download("punkt_tab", quiet=True):
return [text]
sentences = sent_tokenize(text)
return [s.strip() for s in sentences if s.strip()] or [text]
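
# For instance (assuming the punkt_tab data is available):
#   _nltk_sentences("It works. Mostly.") -> ["It works.", "Mostly."]
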
def decompose_into_claims(text: str) -> list[str]:
    """Split an LLM response into atomic, self-contained claims.

    Tries LLM-based atomic decomposition first; on failure, falls back to
    NLTK sentence segmentation.
    """
text = text.strip()
if not text:
return []
claims = _llm_decompose(text)
if claims:
return claims
return _nltk_sentences(text)
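
# Typical call site (output is hypothetical; the exact split depends on the
# model behind HF_FALLBACK_MODEL, note the resolved pronoun):
#   decompose_into_claims("Aspirin thins the blood. It was first made in 1897.")
#   -> ["Aspirin thins the blood.", "Aspirin was first made in 1897."]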