# memory_utils.py
import torch
from typing import Dict, Optional, Tuple

from titans_pytorch import MemoryAsContextTransformer


def load_model_from_checkpoint(checkpoint_path: str) -> MemoryAsContextTransformer:
    """
    Load a model from a checkpoint, restoring all necessary configuration.
    """
    # Load on CPU first so GPU-saved checkpoints also open on CPU-only
    # machines; the model is moved to GPU below if one is available
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Get the hyperparameters saved alongside the weights
    hyperparams = checkpoint['hyperparams']

    # Initialize the model with the saved hyperparameters
    model = MemoryAsContextTransformer(
        num_tokens = 256,
        dim = 384,
        depth = 8,
        segment_len = hyperparams['WINDOW_SIZE'],
        num_persist_mem_tokens = hyperparams['NUM_PERSIST_MEM'],
        num_longterm_mem_tokens = hyperparams['NUM_LONGTERM_MEM'],
        neural_memory_layers = hyperparams['NEURAL_MEM_LAYERS'],
        neural_memory_segment_len = hyperparams['NEURAL_MEM_SEGMENT_LEN'],
        neural_mem_gate_attn_output = True,
        aux_kv_recon_loss_weight = hyperparams['KV_RECON_LOSS_WEIGHT'],
        use_flex_attn = True,
        sliding_window_attn = hyperparams['SLIDING_WINDOWS'],
        neural_memory_kwargs = dict(
            dim_head = 64,
            heads = 4,
            use_accelerated_scan = True,
            learned_mem_model_weights = hyperparams['LEARNED_MEM_MODEL_WEIGHTS'],
            default_model_kwargs = dict(
                depth = hyperparams['NEURAL_MEMORY_DEPTH'],
            )
        )
    )

    # Move to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()

    # Restore the trained weights
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded checkpoint from batch {checkpoint['batch_idx']}")
    return model
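
# Example usage (hypothetical path; assumes the checkpoint was written with
# 'hyperparams', 'model_state_dict' and 'batch_idx' keys as expected above):
#
#   model = load_model_from_checkpoint('./checkpoints/model_batch_1000.pt')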


def save_memory_state(model, save_path: str) -> Dict:
    """
    Save the neural memory states from all memory layers in the model.
    """
    memory_states = {}

    # Extract the memory state from each layer that has a neural memory module
    for idx, (attn, _) in enumerate(model.layers):
        if hasattr(attn, 'neural_mem'):
            mem = attn.neural_mem
            if mem is not None and hasattr(mem, 'previous_state'):
                memory_states[f'layer_{idx}_memory'] = {
                    'previous_state': mem.previous_state
                }

    # Save to file
    torch.save(memory_states, save_path)
    print(f"Memory states saved to {save_path}")
    return memory_states


def load_memory_state(model, load_path: str) -> Dict:
    """
    Load neural memory states from disk and restore them to the model.
    """
    # Map to CPU when no GPU is available so GPU-saved states still load
    if not torch.cuda.is_available():
        memory_states = torch.load(load_path, map_location='cpu')
    else:
        memory_states = torch.load(load_path)

    # Restore the memory state to each matching layer
    for idx, (attn, _) in enumerate(model.layers):
        if hasattr(attn, 'neural_mem'):
            mem = attn.neural_mem
            if mem is not None and f'layer_{idx}_memory' in memory_states:
                mem.previous_state = memory_states[f'layer_{idx}_memory']['previous_state']

    print(f"Memory states loaded from {load_path}")
    return memory_states
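
# Round-trip example (hypothetical path):
#
#   states = save_memory_state(model, './memory_state.pt')
#   load_memory_state(model, './memory_state.pt')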


def process_text_and_update_memory(
    model,
    text: str,
    chunk_size: int = 512,
    save_memory: bool = True,
    memory_path: Optional[str] = None
) -> Tuple[torch.Tensor, Optional[Dict]]:
    """
    Process text through the model while updating its neural memory.

    Returns the model output for the final chunk, along with the saved
    memory states (or None if saving was not requested).
    """
    model.eval()

    # Convert text to byte-level tokens (the model was built with
    # num_tokens = 256, so characters must fall in the 0-255 range)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokens = torch.tensor([[ord(c) for c in text]], device=device)

    # Process in chunks so the neural memory is updated incrementally
    chunks = tokens.split(chunk_size, dim=1)
    last_output = None
    with torch.no_grad():
        for chunk in chunks:
            last_output = model(chunk)

    # Save the updated memory state if requested
    memory_states = None
    if save_memory and memory_path:
        memory_states = save_memory_state(model, memory_path)

    return last_output, memory_states


def decode_token(token):
    """Decode a single token id to a character (control chars map to space)."""
    return chr(max(32, token))


def decode_tokens(tokens):
    """Decode a sequence of token ids to text."""
    return ''.join(map(decode_token, tokens))
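

# Minimal end-to-end sketch of how these utilities fit together. The paths
# below are hypothetical placeholders, and this assumes a checkpoint saved in
# the format expected by load_model_from_checkpoint above.
if __name__ == '__main__':
    model = load_model_from_checkpoint('./checkpoints/model_batch_1000.pt')

    # Feed text through the model in chunks, persisting the updated memory
    output, memory_states = process_text_and_update_memory(
        model,
        text = 'The neural memory accumulates context from everything it reads.',
        chunk_size = 512,
        save_memory = True,
        memory_path = './memory_state.pt',
    )

    # Later (e.g. in a fresh process), the saved memory can be restored
    load_memory_state(model, './memory_state.pt')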