COSMICChunk
Output chunk with rich metadata.
Class Definition
@dataclass(frozen=True)
class COSMICChunk:
    """Immutable chunk output from COSMIC pipeline.

    This is a frozen dataclass - all attributes are read-only
    after creation.
    """
Attributes
Identity
| Attribute | Type | Description |
|-----------|------|-------------|
| chunk_id | str | Unique identifier |
| document_id | str | Source document ID |
| chunk_index | int | Position in document (0-indexed) |
Content
| Attribute | Type | Description |
|-----------|------|-------------|
| text | str | Chunk text content |
| token_count | int | Token count (tiktoken) |
Location
| Attribute | Type | Description |
|-----------|------|-------------|
| page_start | int | First page (1-indexed) |
| page_end | int | Last page |
| char_start | int | Character offset start |
| char_end | int | Character offset end |
| sentence_indices | tuple[int, ...] | Included sentence indices |
Domain Classification
| Attribute | Type | Description |
|-----------|------|-------------|
| domain | str | Primary domain |
| subdomain | Optional[str] | Optional subdomain |
| domain_confidence | float | Confidence (0-1) |
Quality Metrics
| Attribute | Type | Description |
|-----------|------|-------------|
| coherence_score | float | Internal coherence (0-1) |
| boundary_confidence | float | Boundary confidence (0-1) |
| processing_mode | ProcessingMode | Pipeline used |
Cross-References
| Attribute | Type | Description |
|-----------|------|-------------|
| cross_references | tuple[str, ...] | Referenced chunk IDs |
| referenced_by | tuple[str, ...] | IDs that reference this |
| has_unresolved_references | bool | Unresolved flag |
Intent Analysis
| Attribute | Type | Description |
|-----------|------|-------------|
| intent | Intent | Primary intent |
| intent_confidence | float | Intent confidence (0-1) |
Structure
| Attribute | Type | Description |
|-----------|------|-------------|
| contains_heading | bool | Has heading |
| contains_list | bool | Has list |
| contains_table | bool | Has table |
| Attribute | Type | Description |
|-----------|------|-------------|
| metadata | dict | Additional metadata |
Usage Examples
Accessing Attributes
from cosmic import COSMICChunker, Document

# Chunk a document, then read the attributes of each resulting chunk.
chunker = COSMICChunker()
doc = Document.from_text("Your text here...")
chunks = chunker.chunk_document(doc)

for chunk in chunks:
    # Identity
    print(f"ID: {chunk.chunk_id}")
    print(f"Index: {chunk.chunk_index}")

    # Content
    print(f"Tokens: {chunk.token_count}")
    print(f"Text: {chunk.text[:100]}...")

    # Domain
    print(f"Domain: {chunk.domain}")
    print(f"Confidence: {chunk.domain_confidence:.2f}")

    # Quality
    print(f"Coherence: {chunk.coherence_score:.2f}")
    print(f"Mode: {chunk.processing_mode}")

    # Intent
    print(f"Intent: {chunk.intent}")
Filtering Chunks
# Chunks whose internal coherence score is above 0.8
coherent = [chunk for chunk in chunks if chunk.coherence_score > 0.8]

# Chunks classified under the "technical" domain
technical = [chunk for chunk in chunks if chunk.domain == "technical"]

# Chunks with a non-empty cross_references tuple
with_refs = [chunk for chunk in chunks if chunk.cross_references]

# Chunks flagged as containing a heading
with_headings = [chunk for chunk in chunks if chunk.contains_heading]
Serialization
import json
def chunk_to_dict(chunk: "COSMICChunk") -> dict:
    """Convert a chunk into a nested, JSON-serializable dictionary.

    Attributes are grouped under "location", "domain", "quality",
    "references", "intent" and "structure" keys; identity and content
    attributes stay at the top level.

    Args:
        chunk: The chunk to serialize. The annotation is a forward-reference
            string so this snippet runs without importing the class.

    Returns:
        A dict built from the chunk's attributes. Enum-valued attributes
        (``processing_mode``, ``intent``) are stored as their ``.value`` and
        tuple attributes are converted to lists. ``metadata`` is passed
        through as-is (same dict object, not a copy).
    """
    return {
        "chunk_id": chunk.chunk_id,
        "document_id": chunk.document_id,
        "chunk_index": chunk.chunk_index,
        "text": chunk.text,
        "token_count": chunk.token_count,
        "location": {
            "page_start": chunk.page_start,
            "page_end": chunk.page_end,
            "char_start": chunk.char_start,
            "char_end": chunk.char_end,
        },
        "domain": {
            "primary": chunk.domain,
            "subdomain": chunk.subdomain,
            "confidence": chunk.domain_confidence,
        },
        "quality": {
            "coherence_score": chunk.coherence_score,
            "boundary_confidence": chunk.boundary_confidence,
            # Enums are not JSON-serializable; store the underlying value.
            "processing_mode": chunk.processing_mode.value,
        },
        "references": {
            # Tuples become lists so the structure is plain JSON-style data.
            "references": list(chunk.cross_references),
            "referenced_by": list(chunk.referenced_by),
            "has_unresolved": chunk.has_unresolved_references,
        },
        "intent": {
            "primary": chunk.intent.value,
            "confidence": chunk.intent_confidence,
        },
        "structure": {
            "contains_heading": chunk.contains_heading,
            "contains_list": chunk.contains_list,
            "contains_table": chunk.contains_table,
        },
        "metadata": chunk.metadata,
    }
# Export to JSON. The with-block closes the file handle when the dump
# finishes (or fails) instead of leaving it open.
data = [chunk_to_dict(c) for c in chunks]
with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)
ProcessingMode Enum
from cosmic import ProcessingMode

# Values
ProcessingMode.FULL_COSMIC  # Full 6-stage pipeline
ProcessingMode.SEMANTIC_ONLY  # DCS-based boundaries
ProcessingMode.SLIDING_WINDOW  # Similarity-based
ProcessingMode.FIXED_LENGTH  # Token-based

# Check processing mode
if chunk.processing_mode == ProcessingMode.FULL_COSMIC:
    print("Processed with full pipeline")
Intent Enum
from cosmic import Intent
# Values (each trailing comment is an illustrative phrase for that intent)
Intent.DEFINE # "What is X?"
Intent.EXPLAIN # "How does X work?"
Intent.LIST # "Steps to do X"
Intent.ARGUE # "X is better because..."
Intent.DESCRIBE # "X has properties..."
Intent.INSTRUCT # "To do X, first..."
Intent.SUMMARIZE # "In summary, X..."
Intent.COMPARE # "X vs Y..."
Intent.NARRATE # "Then X happened..."
Immutability
COSMICChunk is a frozen dataclass. Attributes cannot be modified:
chunk.text = "new text" # Raises dataclasses.FrozenInstanceError
To create a modified version:
from dataclasses import replace
# dataclasses.replace() returns a new instance with the given fields changed;
# the original chunk is left unmodified
new_chunk = replace(chunk, metadata={"modified": True})