Skip to content

COSMICChunk

The output chunk produced by the COSMIC pipeline, carrying rich metadata.

Class Definition

@dataclass(frozen=True)
class COSMICChunk:
    """
    Immutable chunk output from the COSMIC pipeline.

    This is a frozen dataclass: all attributes are read-only after
    creation, and assigning to one raises ``FrozenInstanceError``.
    Use ``dataclasses.replace`` to obtain a modified copy.
    """

Attributes

Identity

| Attribute | Type | Description |
|---|---|---|
| `chunk_id` | `str` | Unique identifier |
| `document_id` | `str` | Source document ID |
| `chunk_index` | `int` | Position in document (0-indexed) |

Content

| Attribute | Type | Description |
|---|---|---|
| `text` | `str` | Chunk text content |
| `token_count` | `int` | Token count (tiktoken) |

Location

| Attribute | Type | Description |
|---|---|---|
| `page_start` | `int` | First page (1-indexed) |
| `page_end` | `int` | Last page |
| `char_start` | `int` | Character offset start |
| `char_end` | `int` | Character offset end |
| `sentence_indices` | `tuple[int, ...]` | Indices of the sentences included in the chunk |

Domain Classification

| Attribute | Type | Description |
|---|---|---|
| `domain` | `str` | Primary domain |
| `subdomain` | `Optional[str]` | Optional subdomain |
| `domain_confidence` | `float` | Domain classification confidence (0-1) |

Quality Metrics

| Attribute | Type | Description |
|---|---|---|
| `coherence_score` | `float` | Internal coherence (0-1) |
| `boundary_confidence` | `float` | Boundary confidence (0-1) |
| `processing_mode` | `ProcessingMode` | Pipeline mode used to produce the chunk |

Cross-References

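| Attribute | Type | Description |
|---|---|---|
| `cross_references` | `tuple[str, ...]` | IDs of chunks referenced by this chunk |
| `referenced_by` | `tuple[str, ...]` | IDs of chunks that reference this chunk |
| `has_unresolved_references` | `bool` | Whether the chunk has unresolved references |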

Intent Analysis

| Attribute | Type | Description |
|---|---|---|
| `intent` | `Intent` | Primary intent |
| `intent_confidence` | `float` | Intent confidence (0-1) |

Structure

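| Attribute | Type | Description |
|---|---|---|
| `contains_heading` | `bool` | Whether the chunk contains a heading |
| `contains_list` | `bool` | Whether the chunk contains a list |
| `contains_table` | `bool` | Whether the chunk contains a table |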

Metadata

| Attribute | Type | Description |
|---|---|---|
| `metadata` | `dict` | Additional metadata |

Usage Examples

Accessing Attributes

from cosmic import COSMICChunker, Document

# Build a chunker with no arguments and chunk a document created from raw text
chunker = COSMICChunker()
doc = Document.from_text("Your text here...")
chunks = chunker.chunk_document(doc)

# Print a selection of attributes for every chunk
for chunk in chunks:
    # Identity
    print(f"ID: {chunk.chunk_id}")
    print(f"Index: {chunk.chunk_index}")

    # Content (only the first 100 characters of text are shown;
    # the trailing "..." is printed regardless of the text length)
    print(f"Tokens: {chunk.token_count}")
    print(f"Text: {chunk.text[:100]}...")

    # Domain (confidence formatted to two decimal places)
    print(f"Domain: {chunk.domain}")
    print(f"Confidence: {chunk.domain_confidence:.2f}")

    # Quality (coherence formatted to two decimal places)
    print(f"Coherence: {chunk.coherence_score:.2f}")
    print(f"Mode: {chunk.processing_mode}")

    # Intent
    print(f"Intent: {chunk.intent}")

Filtering Chunks

# High coherence chunks (coherence_score strictly above 0.8)
coherent = [c for c in chunks if c.coherence_score > 0.8]

# Technical domain (exact match on the primary domain string)
technical = [c for c in chunks if c.domain == "technical"]

# With references (a non-empty cross_references tuple is truthy)
with_refs = [c for c in chunks if c.cross_references]

# Contains headings
with_headings = [c for c in chunks if c.contains_heading]

Serialization

import json

def chunk_to_dict(chunk: COSMICChunk) -> dict:
    """Convert a chunk into a nested dict suitable for ``json.dump``.

    Attributes are grouped under the keys ``location``, ``domain``,
    ``quality``, ``references``, ``intent`` and ``structure``; identity and
    content attributes stay at the top level.

    Args:
        chunk: The chunk to serialize.

    Returns:
        A new dict. Tuples are converted to lists and enum attributes
        (``processing_mode``, ``intent``) are emitted as their ``.value``.
        ``metadata`` is a shallow copy, so adding or removing keys in the
        result does not touch the chunk; its values are passed through
        unchanged and must themselves be JSON-serializable for export.
    """
    return {
        "chunk_id": chunk.chunk_id,
        "document_id": chunk.document_id,
        "chunk_index": chunk.chunk_index,
        "text": chunk.text,
        "token_count": chunk.token_count,
        "location": {
            "page_start": chunk.page_start,
            "page_end": chunk.page_end,
            "char_start": chunk.char_start,
            "char_end": chunk.char_end,
            # Previously omitted, which dropped a documented Location field.
            "sentence_indices": list(chunk.sentence_indices),
        },
        "domain": {
            "primary": chunk.domain,
            "subdomain": chunk.subdomain,
            "confidence": chunk.domain_confidence,
        },
        "quality": {
            "coherence_score": chunk.coherence_score,
            "boundary_confidence": chunk.boundary_confidence,
            "processing_mode": chunk.processing_mode.value,
        },
        "references": {
            "references": list(chunk.cross_references),
            "referenced_by": list(chunk.referenced_by),
            "has_unresolved": chunk.has_unresolved_references,
        },
        "intent": {
            "primary": chunk.intent.value,
            "confidence": chunk.intent_confidence,
        },
        "structure": {
            "contains_heading": chunk.contains_heading,
            "contains_list": chunk.contains_list,
            "contains_table": chunk.contains_table,
        },
        # Copy so the exported dict does not alias the chunk's own metadata.
        "metadata": dict(chunk.metadata),
    }

# Export to JSON
data = [chunk_to_dict(c) for c in chunks]
# Use a context manager so the file is flushed and closed even if dumping fails
with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

ProcessingMode Enum

from cosmic import ProcessingMode

# Values (listed for reference; each comment names the pipeline it denotes)
ProcessingMode.FULL_COSMIC      # Full 6-stage pipeline
ProcessingMode.SEMANTIC_ONLY    # DCS-based boundaries
ProcessingMode.SLIDING_WINDOW   # Similarity-based
ProcessingMode.FIXED_LENGTH     # Token-based

# Check processing mode (stored on each chunk as chunk.processing_mode)
if chunk.processing_mode == ProcessingMode.FULL_COSMIC:
    print("Processed with full pipeline")

Intent Enum

from cosmic import Intent

# Values, each annotated with an illustrative phrase for that intent
Intent.DEFINE      # "What is X?"
Intent.EXPLAIN     # "How does X work?"
Intent.LIST        # "Steps to do X"
Intent.ARGUE       # "X is better because..."
Intent.DESCRIBE    # "X has properties..."
Intent.INSTRUCT    # "To do X, first..."
Intent.SUMMARIZE   # "In summary, X..."
Intent.COMPARE     # "X vs Y..."
Intent.NARRATE     # "Then X happened..."

Immutability

COSMICChunk is a frozen dataclass, so its attributes cannot be reassigned after creation:

chunk.text = "new text"  # Raises dataclasses.FrozenInstanceError

To create a modified version, use `dataclasses.replace`, which returns a new instance:

from dataclasses import replace

# Create a new chunk with a modified attribute; the original chunk is unchanged.
# Note: the whole metadata dict is replaced here, not merged with the existing one.
new_chunk = replace(chunk, metadata={"modified": True})