diff --git a/competitions/data.py b/competitions/data.py index 0a0afcc..d001a99 100644 --- a/competitions/data.py +++ b/competitions/data.py @@ -11,6 +11,8 @@ class CompetitionId(IntEnum): B7_MULTI_CHOICE = 2 + INSTRUCT_8B = 3 + # Overwrite the default __repr__, which doesn't work with # bt.logging for some unknown reason. def __repr__(self) -> str: diff --git a/constants/__init__.py b/constants/__init__.py index 9da1130..5a2b974 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -15,11 +15,14 @@ from transformers import ( BartForCausalLM, FalconForCausalLM, + Gemma2ForCausalLM, GemmaForCausalLM, GPTNeoXForCausalLM, LlamaForCausalLM, MistralForCausalLM, + Phi3ForCausalLM, PhiForCausalLM, + Qwen2ForCausalLM, ) from competitions.data import CompetitionId @@ -30,7 +33,7 @@ # Project Constants. # --------------------------------- -__version__ = "2.5.1" +__version__ = "2.6.0" version_split = __version__.split(".") __spec_version__ = ( (1000 * int(version_split[0])) @@ -45,16 +48,16 @@ # Block the subnet was registered. GENESIS_BLOCK = 3138611 # Define the number of blocks per vali "sync". This cadence is used to align validator behavior for better vtrust. -SYNC_BLOCK_CADENCE = 180 +SYNC_BLOCK_CADENCE = 270 # Rough estimate of the number of seconds per block. SECONDS_PER_BLOCK = 12 # Validator weight moving average term. -# At 0.9 a model will go from 0 -> 0.190 in 2 cycles and from 0 -> 0.83 in 17 cycles. -ALPHA = 0.9 +# At 0.85 a model will go from 0 -> 0.278 in 2 cycles and from 0 -> 0.833 in 11 cycles. +ALPHA = 0.85 # Any miners with a combined competition weight below this threshold will instead receive 0 weight. # This is intended to help vtrust in conjunction with a low alpha by handling the tail ends. -# At 1 eval per 180 blocks, newly winning models will start recieving weight after ~360 blocks. -# Previously winning models will phase out after ~3060 blocks, at which point only the new winner will have weight. 
+# At 1 eval per 270 blocks, newly winning models will start receiving weight after ~540 blocks. +# Previously winning models will phase out after ~2970 blocks, at which point only the new winner will have weight. MIN_WEIGHT_THRESHOLD = 0.18 # The validator WANDB project. @@ -75,7 +78,8 @@ WEIGHT_SYNC_VALI_MIN_STAKE = 100_000 # Minimum percent of weight on a vali for a miner to be considered a top miner. # Since there can be multiple competitions at different reward percentages we can't just check biggest. -WEIGHT_SYNC_MINER_MIN_PERCENT = 0.10 +# Since we only set weights per competition with a threshold of 0.18 we can just take any percent here. +WEIGHT_SYNC_MINER_MIN_PERCENT = 0.01 # The root directory of this project. ROOT_DIR = Path(__file__).parent.parent # The maximum bytes for the hugging face repo. @@ -98,7 +102,7 @@ kwargs={ "torch_dtype": torch.bfloat16, }, - eval_block_delay=1200, # ~4 hours. + eval_block_delay=1600, # ~5 hours. norm_validation_constraints=NormValidationConstraints( norm_eps_soft=200, norm_eps_soft_percent_threshold=0.15, @@ -107,10 +111,36 @@ epsilon_func=LinearDecay(0.05, 0.01, 7200 * 5), # Decay over ~5 days. max_bytes=15 * 1024 * 1024 * 1024, ), + CompetitionId.INSTRUCT_8B: ModelConstraints( + max_model_parameter_size=8_100_000_000, + sequence_length=4096, + allowed_architectures=[ + BartForCausalLM, + FalconForCausalLM, + Gemma2ForCausalLM, + GemmaForCausalLM, + GPTNeoXForCausalLM, + LlamaForCausalLM, + MistralForCausalLM, + Phi3ForCausalLM, + PhiForCausalLM, + ], + tokenizer=None, # Any tokenizer can be used. + kwargs={ + "torch_dtype": torch.bfloat16, + }, + eval_block_delay=1600, # ~5 hours. + norm_validation_constraints=NormValidationConstraints( + norm_eps_soft=200, + norm_eps_soft_percent_threshold=0.15, + norm_eps_hard=1000, + ), + epsilon_func=LinearDecay(0.05, 0.01, 7200 * 5), # Decay over ~5 days. + max_bytes=20 * (1024**3), + ), } -# Block to start including fineweb data. 
-IF_EVAL_BLOCK = 4_344_030 +INSTRUCT_8B_BLOCK = 4_451_695 # Schedule of competitions by block. COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [ @@ -127,7 +157,7 @@ method_id=EvalMethodId.MULTIPLE_CHOICE, dataset_id=DatasetId.SYNTHETIC_MMLU, normalization_id=NormalizationId.NONE, - weight=0.9, + weight=0.85, ), EvalTask( name="WORD_SORTING", @@ -145,24 +175,31 @@ normalization_kwargs={"ceiling": 20.0}, weight=0.05, ), + EvalTask( + name="IF_EVAL_V1", + method_id=EvalMethodId.IF_EVAL, + dataset_id=DatasetId.SYNTHETIC_IF_EVAL, + normalization_id=NormalizationId.NONE, + weight=0.05, + ), ], ), ], ), ( - IF_EVAL_BLOCK, + INSTRUCT_8B_BLOCK, [ Competition( CompetitionId.B7_MULTI_CHOICE, MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MULTI_CHOICE], - 1.0, + 0.9, eval_tasks=[ EvalTask( name="SYNTHETIC_MMLU", method_id=EvalMethodId.MULTIPLE_CHOICE, dataset_id=DatasetId.SYNTHETIC_MMLU, normalization_id=NormalizationId.NONE, - weight=0.85, + weight=0.8, ), EvalTask( name="WORD_SORTING", @@ -178,8 +215,45 @@ dataset_id=DatasetId.FINEWEB, normalization_id=NormalizationId.INVERSE_EXPONENTIAL, normalization_kwargs={"ceiling": 20.0}, + weight=0.1, + ), + EvalTask( + name="IF_EVAL_V1", + method_id=EvalMethodId.IF_EVAL, + dataset_id=DatasetId.SYNTHETIC_IF_EVAL, + normalization_id=NormalizationId.NONE, + weight=0.05, + ), + ], + ), + Competition( + CompetitionId.INSTRUCT_8B, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.INSTRUCT_8B], + 0.1, + eval_tasks=[ + EvalTask( + name="SYNTHETIC_MMLU", + method_id=EvalMethodId.MULTIPLE_CHOICE, + dataset_id=DatasetId.SYNTHETIC_MMLU, + normalization_id=NormalizationId.NONE, + weight=0.8, + ), + EvalTask( + name="WORD_SORTING", + method_id=EvalMethodId.REFERENCE_LOSS, + dataset_id=DatasetId.WORD_SORTING, + normalization_id=NormalizationId.INVERSE_EXPONENTIAL, + normalization_kwargs={"ceiling": 40.0}, weight=0.05, ), + EvalTask( + name="FINEWEB", + method_id=EvalMethodId.TEXT_LOSS, + dataset_id=DatasetId.FINEWEB, + 
normalization_id=NormalizationId.INVERSE_EXPONENTIAL, + normalization_kwargs={"ceiling": 20.0}, + weight=0.1, + ), EvalTask( name="IF_EVAL_V1", method_id=EvalMethodId.IF_EVAL, diff --git a/docs/competitions.md b/docs/competitions.md index 6f7d29c..d36ad54 100644 --- a/docs/competitions.md +++ b/docs/competitions.md @@ -18,6 +18,20 @@ Models submitted to this competition are evaluated on a set of evaluation tasks, [Code Link](https://github.com/macrocosm-os/finetuning/blob/94e8fd92ab4158e1e4a425a9562695eebafa27b1/constants/__init__.py#L128) +## Competition INSTRUCT_8B: + +### Goal + +The goal of this competition is to train a SOTA instruct 8B model. This competition provides more freedom to miners than other competitions: there are no restrictions on the tokenizer used and miners are allowed to use a wider range of architectures. + +### Evaluation + +The evaluation tasks are the same as the B7_MULTI_CHOICE competition. + +### Definitions + +TODO: Fill in post check-in + # Deprecated Competitions ## Competition 1: SN9_MODEL diff --git a/docs/examples.ipynb b/docs/examples.ipynb index 819cddc..74db6a4 100644 --- a/docs/examples.ipynb +++ b/docs/examples.ipynb @@ -92,8 +92,8 @@ ")\n", "\n", "# Move the model to the appropriate device and set to eval mode.\n", - "model.to(device)\n", - "model.eval()\n", + "model.pt_model.to(device)\n", + "model.pt_model.eval()\n", "\n", "# Load the competition so we can load the right tokenizer.\n", "metagraph = bt.metagraph(constants.SUBNET_UID)\n", @@ -134,7 +134,7 @@ " pad_token_id=tokenizer.eos_token_id,\n", ")\n", "response = ft.eval.method.generate_output(\n", - " model=model,\n", + " model=model.pt_model,\n", " input_ids=input_ids,\n", " generation_config=generation_config,\n", " device=device,\n", diff --git a/finetune/datasets/factory.py b/finetune/datasets/factory.py index d257d18..ab91f73 100644 --- a/finetune/datasets/factory.py +++ b/finetune/datasets/factory.py @@ -1,22 +1,24 @@ +from typing import Any, Dict, Set + from 
finetune.datasets.generated.dyck_loader import DyckLoader from finetune.datasets.generated.if_eval_loader import IFEvalLoader from finetune.datasets.generated.word_sorting_loader import WordSortingLoader from finetune.datasets.hugging_face.hugging_face_loader import ( - HuggingFaceLoader, FINEWEB_EDU_SCORE_2_NAME, + HuggingFaceLoader, ) from finetune.datasets.ids import DatasetId -from typing import Dict, Any, Set +from finetune.datasets.loader import DatasetLoader -class DatasetLoader: +class DatasetLoaderFactory: @staticmethod def get_loader( dataset_id: DatasetId, dataset_kwargs: Dict[str, Any], seed: int, validator_hotkeys: Set[str], - ) -> "DatasetLoader": + ) -> DatasetLoader: """Loads data samples from the appropriate dataset.""" match dataset_id: diff --git a/finetune/datasets/generated/dyck_loader.py b/finetune/datasets/generated/dyck_loader.py index da4272d..ba52b3f 100644 --- a/finetune/datasets/generated/dyck_loader.py +++ b/finetune/datasets/generated/dyck_loader.py @@ -19,6 +19,8 @@ import torch from transformers import PreTrainedTokenizerBase +from finetune.datasets.loader import DatasetLoader + # Characters to use in the dycks. 
DYCK_CHARACTER_PAIRS = [("<", ">"), ("[", "]"), ("{", "}"), ("(", ")")] DYCK_ENDING_CHARS = [x[1] for x in DYCK_CHARACTER_PAIRS] @@ -69,7 +71,7 @@ def generate_dyck( return dyck_word -class DyckLoader: +class DyckLoader(DatasetLoader): def __init__( self, dyck_character_pairs: typing.List[ diff --git a/finetune/datasets/generated/if_eval_loader.py b/finetune/datasets/generated/if_eval_loader.py index 225abdb..4ae514a 100644 --- a/finetune/datasets/generated/if_eval_loader.py +++ b/finetune/datasets/generated/if_eval_loader.py @@ -8,12 +8,13 @@ from transformers import PreTrainedTokenizerBase from finetune.datasets.generated.mmlu_parser import extract_q_and_a_text +from finetune.datasets.loader import DatasetLoader from finetune.datasets.subnet.prompting_subset_loader import PromptingSubsetLoader from finetune.eval.if_eval import rule_factory from finetune.eval.if_eval.sample import IFEvalTokenizedSample -class IFEvalLoader: +class IFEvalLoader(DatasetLoader): """Generates samples for the IfEval task.""" # The min/max number of rules per sample. 
diff --git a/finetune/datasets/generated/word_sorting_loader.py b/finetune/datasets/generated/word_sorting_loader.py index 9707404..575bc22 100644 --- a/finetune/datasets/generated/word_sorting_loader.py +++ b/finetune/datasets/generated/word_sorting_loader.py @@ -20,6 +20,8 @@ import torch from transformers import PreTrainedTokenizerBase +from finetune.datasets.loader import DatasetLoader + try: from nltk.corpus import words except: @@ -28,7 +30,7 @@ WORD_SORTING_CHALLENGE_PROMPT = "Sort the following words alphabetically: " -class WordSortingLoader: +class WordSortingLoader(DatasetLoader): def __init__( self, min_word_count: int = 2, diff --git a/finetune/datasets/hugging_face/hugging_face_loader.py b/finetune/datasets/hugging_face/hugging_face_loader.py index 88efcba..7b2c2ea 100644 --- a/finetune/datasets/hugging_face/hugging_face_loader.py +++ b/finetune/datasets/hugging_face/hugging_face_loader.py @@ -22,12 +22,14 @@ import bittensor as bt from transformers import PreTrainedTokenizerBase +from finetune.datasets.loader import DatasetLoader + FINEWEB_EDU_SCORE_2_NAME = "HuggingFaceFW/fineweb-edu-score-2" FALCON_NAME = "tiiuae/falcon-refinedweb" -class HuggingFaceLoader: +class HuggingFaceLoader(DatasetLoader): rows_base_url: str = "https://datasets-server.huggingface.co/rows" size_base_url: str = "https://datasets-server.huggingface.co/size" diff --git a/finetune/datasets/loader.py b/finetune/datasets/loader.py new file mode 100644 index 0000000..115fd98 --- /dev/null +++ b/finetune/datasets/loader.py @@ -0,0 +1,24 @@ +import abc +from typing import List + +from transformers import PreTrainedTokenizerBase + +from finetune.eval.sample import EvalSample + + +class DatasetLoader(abc.ABC): + """Base class for dataset loaders.""" + + @abc.abstractmethod + def tokenize( + self, tokenizer: PreTrainedTokenizerBase, sequence_length: int + ) -> List[EvalSample]: + pass + + @abc.abstractmethod + def __iter__(self): + pass + + @abc.abstractmethod + def __len__(self): + 
pass diff --git a/finetune/datasets/subnet/prompting_subset_loader.py b/finetune/datasets/subnet/prompting_subset_loader.py index e9a7f40..0fe2360 100644 --- a/finetune/datasets/subnet/prompting_subset_loader.py +++ b/finetune/datasets/subnet/prompting_subset_loader.py @@ -27,6 +27,7 @@ from transformers import PreTrainedTokenizerBase import constants +from finetune.datasets.loader import DatasetLoader from finetune.datasets.subnet.history_scan import SampledHistoryScan # Multiple choice answers for the prompting subnet. @@ -36,7 +37,7 @@ EARLIEST_DATE = dt.datetime(2024, 8, 29, tzinfo=dt.timezone.utc) -class PromptingSubsetLoader: +class PromptingSubsetLoader(DatasetLoader): @staticmethod def _get_filters( validator_hotkeys: typing.List[str], diff --git a/finetune/mining.py b/finetune/mining.py index 5f60c41..b5c35e1 100644 --- a/finetune/mining.py +++ b/finetune/mining.py @@ -34,7 +34,7 @@ from taoverse.model.storage.model_metadata_store import ModelMetadataStore from taoverse.model.storage.remote_model_store import RemoteModelStore from taoverse.model.utils import get_hash_of_two_strings -from transformers import AutoModelForCausalLM, PreTrainedModel +from transformers import AutoModelForCausalLM, AutoTokenizer import constants import finetune as ft @@ -49,7 +49,7 @@ def model_path(base_dir: str, run_id: str) -> str: async def push( - model: PreTrainedModel, + model: Model, repo: str, competition_id: CompetitionId, wallet: bt.wallet, @@ -61,7 +61,7 @@ async def push( """Pushes the model to Hugging Face and publishes it on the chain for evaluation by validators. Args: - model (PreTrainedModel): The model to push. + model (Model): The model to push. ModelId is overwritten based on the other parameters. repo (str): The repo to push to. Must be in format "namespace/name". competition_id (CompetitionId): The competition the miner is participating in. wallet (bt.wallet): The wallet of the Miner uploading the model. 
@@ -89,24 +89,28 @@ async def push( # First upload the model to HuggingFace. namespace, name = model_utils.validate_hf_repo_id(repo) - model_id = ModelId(namespace=namespace, name=name, competition_id=competition_id) - model_id = await remote_model_store.upload_model( - Model(id=model_id, pt_model=model), model_constraints - ) + # Overwrite the model id with the current information. + model.id = ModelId(namespace=namespace, name=name, competition_id=competition_id) + # Get the new model id which includes hash information. + model_id_with_hash = await remote_model_store.upload_model(model, model_constraints) bt.logging.success("Uploaded model to hugging face.") - secure_hash = get_hash_of_two_strings(model_id.hash, wallet.hotkey.ss58_address) - model_id = replace(model_id, secure_hash=secure_hash) + secure_hash = get_hash_of_two_strings( + model_id_with_hash.hash, wallet.hotkey.ss58_address + ) + model_id_with_hash = replace(model_id_with_hash, secure_hash=secure_hash) - bt.logging.success(f"Now committing to the chain with model_id: {model_id}") + bt.logging.success( + f"Now committing to the chain with model_id: {model_id_with_hash}" + ) # We can only commit to the chain every 20 minutes, so run this in a loop, until # successful. while True: try: await metadata_store.store_model_metadata( - wallet.hotkey.ss58_address, model_id + wallet.hotkey.ss58_address, model_id_with_hash ) bt.logging.info( @@ -119,13 +123,14 @@ async def push( if ( not model_metadata - or model_metadata.id.to_compressed_str() != model_id.to_compressed_str() + or model_metadata.id.to_compressed_str() + != model_id_with_hash.to_compressed_str() ): bt.logging.error( - f"Failed to read back model metadata from the chain. Expected: {model_id}, got: {model_metadata}" + f"Failed to read back model metadata from the chain. Expected: {model_id_with_hash}, got: {model_metadata}" ) raise ValueError( - f"Failed to read back model metadata from the chain. 
Expected: {model_id}, got: {model_metadata}" + f"Failed to read back model metadata from the chain. Expected: {model_id_with_hash}, got: {model_metadata}" ) bt.logging.success("Committed model to the chain.") @@ -147,17 +152,23 @@ async def push( bt.logging.success("Model set to public") -def save(model: PreTrainedModel, model_dir: str): +def save(model: Model, model_dir: str): """Saves a model to the provided directory""" if not os.path.exists(model_dir): os.makedirs(model_dir, exist_ok=True) # Save the model state to the specified path. - model.save_pretrained( + model.pt_model.save_pretrained( save_directory=model_dir, safe_serialization=True, ) + if model.tokenizer is not None: + model.tokenizer.save_pretrained( + save_directory=model_dir, + safe_serialization=True, + ) + async def get_repo( uid: int, @@ -181,15 +192,35 @@ async def get_repo( return model_utils.get_hf_url(model_metadata) -def load_local_model(model_dir: str, kwargs: Dict[str, Any]) -> PreTrainedModel: +def load_local_model(model_dir: str, kwargs: Dict[str, Any]) -> Model: """Loads a model from a directory.""" - return AutoModelForCausalLM.from_pretrained( + model_id = ModelId( + namespace="local_namespace", + name="local_model", + competition_id=CompetitionId.NONE, + ) + + pt_model = AutoModelForCausalLM.from_pretrained( pretrained_model_name_or_path=model_dir, local_files_only=True, use_safetensors=True, **kwargs, ) + # Always try to retrieve a tokenizer from the model directory. If we do not find one leave it None on the Model. + tokenizer = None + try: + # Do not use the kwargs for the model load here. If needed in the future a separate kwargs can be plumbed. 
+ tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_dir, + local_files_only=True, + use_safetensors=True, + ) + except Exception: + pass + + return Model(id=model_id, pt_model=pt_model, tokenizer=tokenizer) + async def load_remote_model( uid: int, @@ -197,7 +228,7 @@ async def load_remote_model( metagraph: Optional[bt.metagraph] = None, metadata_store: Optional[ModelMetadataStore] = None, remote_model_store: Optional[RemoteModelStore] = None, -) -> PreTrainedModel: +) -> Model: """Loads the model currently being advertised by the Miner with the given UID. Args: @@ -235,7 +266,7 @@ async def load_remote_model( model: Model = await remote_model_store.download_model( model_metadata.id, download_dir, model_constraints ) - return model.pt_model + return model async def load_best_model( @@ -244,7 +275,7 @@ async def load_best_model( metagraph: Optional[bt.metagraph] = None, metadata_store: Optional[ModelMetadataStore] = None, remote_model_store: Optional[RemoteModelStore] = None, -) -> PreTrainedModel: +) -> Model: """Loads the model from the best performing miner to download_dir""" best_uid = ft.graph.best_uid(competition_id=competition_id) if best_uid is None: diff --git a/finetune/validation.py b/finetune/validation.py index 19bd835..48b5209 100644 --- a/finetune/validation.py +++ b/finetune/validation.py @@ -23,19 +23,19 @@ import bittensor as bt import torch -import transformers from taoverse.model.competition.data import Competition from taoverse.model.competition.epsilon import EpsilonFunc +from taoverse.model.data import Model from taoverse.model.eval.normalization import normalize_score from taoverse.model.eval.task import EvalTask from transformers import GenerationConfig from finetune.eval.method import ( EvalMethodId, + compute_if_eval, compute_multiple_choice_deviation, compute_reference_loss, compute_text_loss, - compute_if_eval, ) from finetune.eval.sample import EvalSample @@ -134,8 +134,7 @@ class ScoreDetails: def 
score_model( - model, - tokenizer: transformers.PreTrainedTokenizer, + model: Model, evals: typing.List[EvalTask], samples: typing.List[typing.List[EvalSample]], competition: Competition, @@ -145,7 +144,6 @@ def score_model( Args: model (torch.nn.Module): The model to score. - tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use for tokenization. evals (list): A list of EvalTasks to score the model on. samples (list): A list of samples to use for scoring for the eval tasks. Must be the same length as evals. competition (Competition): The competition to score the model for. @@ -157,12 +155,16 @@ def score_model( if len(evals) != len(samples): raise ValueError("Number of eval tasks and samples must match.") + if not model.tokenizer: + raise ValueError("Model does not have a tokenizer") + with torch.inference_mode(): - model.to(device) - model.eval() + model.pt_model.to(device) + model.pt_model.eval() score = 0 score_details = {task.name: ScoreDetails() for task in evals} + tokenizer = model.tokenizer for task, samples in zip(evals, samples): bt.logging.trace(f"Scoring model on task: {task.name}") @@ -177,7 +179,7 @@ def score_model( pad_token_id=tokenizer.eos_token_id, ) raw_score = compute_multiple_choice_deviation( - model=model, + model=model.pt_model, tokenizer=tokenizer, generation_config=compute_mc_generation_config, batches=samples, @@ -185,13 +187,13 @@ def score_model( ) case EvalMethodId.REFERENCE_LOSS: raw_score = compute_reference_loss( - model=model, + model=model.pt_model, batches=samples, device=device, ) case EvalMethodId.TEXT_LOSS: raw_score = compute_text_loss( - model=model, + model=model.pt_model, batches=samples, device=device, pad_token_id=tokenizer.eos_token_id, @@ -206,7 +208,7 @@ def score_model( max_time=5.0, ) raw_score = compute_if_eval( - model=model, + model=model.pt_model, tokenizer=tokenizer, generation_config=compute_if_generation_config, batches=samples, diff --git a/neurons/miner.py b/neurons/miner.py index 
e603ee4..b126c68 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -20,7 +20,6 @@ import datetime as dt import math import os -import random import typing import bittensor as bt @@ -28,6 +27,7 @@ import wandb from dotenv import load_dotenv from taoverse.metagraph import utils as metagraph_utils +from taoverse.model.data import Model from taoverse.model.storage.chain.chain_model_metadata_store import ( ChainModelMetadataStore, ) @@ -35,13 +35,10 @@ HuggingFaceModelStore, ) from taoverse.model.storage.model_metadata_store import ModelMetadataStore -from taoverse.utilities import utils from taoverse.utilities import wandb as wandb_utils -from transformers import PreTrainedModel import constants import finetune as ft -from finetune.datasets.subnet.prompting_subset_loader import PromptingSubsetLoader from neurons import config as neuron_config load_dotenv() # take environment variables from .env. @@ -54,7 +51,7 @@ async def load_starting_model( metagraph: bt.metagraph, metadata_store: ModelMetadataStore, kwargs: typing.Dict[str, typing.Any], -) -> PreTrainedModel: +) -> Model: """Loads the model to train based on the provided config.""" # Initialize the model based on the best on the network. 
diff --git a/neurons/validator.py b/neurons/validator.py index bba3884..2934628 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -18,13 +18,15 @@ # Due to the implementation of disable_progress_bars(), this has to be the first import+call in the application relating to huggingface import dataclasses +import logging from huggingface_hub.utils import disable_progress_bars from retry import retry from taoverse.model.eval.task import EvalTask -from finetune.datasets.factory import DatasetLoader +from finetune.datasets.factory import DatasetLoaderFactory from finetune.datasets.ids import DatasetId +from finetune.datasets.loader import DatasetLoader from finetune.eval.sample import EvalSample from finetune.validation import ScoreDetails @@ -49,6 +51,8 @@ import nltk import torch import wandb +from bittensor.utils.btlogging.defines import BITTENSOR_LOGGER_NAME +from bittensor.utils.btlogging.helpers import all_loggers from dotenv import load_dotenv from rich.console import Console from rich.table import Table @@ -125,8 +129,16 @@ def state_path(self) -> str: def __init__(self): self.config = neuron_config.validator_config() + # Manually default to info before overriding with arguments. + # If this is not done then info logging does not work in the cases where other modes are not specified. + bt.logging.set_info() bt.logging(config=self.config) + # Setting logging level on bittensor messes with all loggers, which we don't want, so set explicitly to warning here. 
+ for logger in all_loggers(): + if not logger.name.startswith(BITTENSOR_LOGGER_NAME): + logger.setLevel(logging.WARNING) + bt.logging.info(f"Starting validator with config: {self.config}") # === Bittensor objects ==== @@ -200,7 +212,7 @@ def __init__(self): self._new_wandb_run() # === Running args === - self.weights = torch.zeros_like(self.metagraph.S) + self.weights = torch.zeros_like(torch.from_numpy(self.metagraph.S)) self.global_step = 0 self.last_epoch = self.metagraph.block.item() @@ -408,6 +420,9 @@ def update_models(self): # Track how recently we checked the list of top models. last_checked_top_models_time = None + # Delay the first update loop until the metagraph has been synced. + time.sleep(60) + # The below loop iterates across all miner uids and checks to see # if they should be updated. while not self.stop_event.is_set(): @@ -715,7 +730,7 @@ async def _try_set_weights(): netuid=self.config.netuid, wallet=self.wallet, uids=uids, - weights=self.weights, + weights=self.weights.numpy(), wait_for_inclusion=False, version_key=constants.weights_version_key, ) @@ -729,15 +744,6 @@ async def _try_set_weights(): except: bt.logging.warning("Failed to set weights. 
Trying again later.") - ws, ui = self.weights.topk(len(self.weights)) - table = Table(title="All Weights") - table.add_column("uid", justify="right", style="cyan", no_wrap=True) - table.add_column("weight", style="magenta") - for index, weight in list(zip(ui.tolist(), ws.tolist())): - table.add_row(str(index), str(round(weight, 4))) - console = Console() - console.print(table) - try: bt.logging.debug(f"Setting weights.") await asyncio.wait_for(_try_set_weights(), ttl) @@ -747,8 +753,13 @@ async def _try_set_weights(): def _get_current_block(self) -> int: """Returns the current block.""" - try: + + @retry(tries=5, delay=1, backoff=2) + def _get_block_with_retry(): return self.subtensor.block + + try: + return _get_block_with_retry() except: bt.logging.debug( "Failed to get the latest block from the chain. Using the block from the cached metagraph." @@ -945,13 +956,17 @@ async def run_step(self): # Pull the latest sample data based on the competition. load_data_perf = PerfMonitor("Eval: Load data") - # Tokenize the data into batches for use in evaluation. - # If custom tokenizers are allowed this will need to be done on a per uid basis instead. - tokenizer = ft.model.load_tokenizer( - competition.constraints, cache_dir=self.config.model_dir - ) + + use_default_tokenizer = False + if competition.constraints.tokenizer: + tokenizer = ft.model.load_tokenizer( + competition.constraints, cache_dir=self.config.model_dir + ) + use_default_tokenizer = True + seed = self._get_seed(sync_block) eval_tasks: typing.List[EvalTask] = [] + data_loaders: typing.List[DatasetLoader] = [] samples: typing.List[typing.List[EvalSample]] = [] # Load data based on the competition. 
@@ -973,7 +988,7 @@ async def run_step(self): vali_hotkeys, ) else: - data_loader = DatasetLoader.get_loader( + data_loader = DatasetLoaderFactory.get_loader( dataset_id=eval_task.dataset_id, dataset_kwargs=eval_task.dataset_kwargs, seed=seed, @@ -982,11 +997,14 @@ async def run_step(self): if data_loader: eval_tasks.append(eval_task) - samples.append( - data_loader.tokenize( - tokenizer, competition.constraints.sequence_length + data_loaders.append(data_loader) + if use_default_tokenizer: + assert tokenizer + samples.append( + data_loader.tokenize( + tokenizer, competition.constraints.sequence_length + ) ) - ) # Compute model score on batches. bt.logging.debug( @@ -1035,13 +1053,29 @@ async def run_step(self): hotkey, model_i_metadata.id, kwargs ) + # If the competition defines a default tokenizer, set it here. + if use_default_tokenizer: + model_i.tokenizer = tokenizer + else: + if not model_i.tokenizer: + raise ValueError( + f"Model {uid_i} does not have a tokenizer." + ) + + samples = [ + loader.tokenize( + model_i.tokenizer, + competition.constraints.sequence_length, + ) + for loader in data_loaders + ] + with compute_score_perf.sample(): # Run each computation in a subprocess so that the GPU is reset between each model. score, score_details = utils.run_in_subprocess( functools.partial( ft.validation.score_model, - model_i.pt_model, - tokenizer, + model_i, eval_tasks, samples, competition, @@ -1090,7 +1124,7 @@ async def run_step(self): # Fill in metagraph sized tensor with the step weights of the evaluated models. with self.metagraph_lock: - competition_weights = torch.zeros_like(self.metagraph.S) + competition_weights = torch.zeros_like(torch.from_numpy(self.metagraph.S)) for i, uid_i in enumerate(uids): competition_weights[uid_i] = step_weights[i] @@ -1105,8 +1139,10 @@ async def run_step(self): # Align competition_tracker to only track active competitions. 
self.competition_tracker.reset_competitions(active_competition_ids) # Update self.weights to the merged values across active competitions. - self.weights = self.competition_tracker.get_subnet_weights(competition_schedule) - self.weights[self.weights < constants.MIN_WEIGHT_THRESHOLD] = 0.0 + self.weights = self.competition_tracker.get_subnet_weights( + competitions=competition_schedule, + min_comp_weight_threshold=constants.MIN_WEIGHT_THRESHOLD, + ) # Prioritize models for keeping up to the sample_min for the next eval loop. # If the model has any significant weight, prioritize by weight with greater weights being kept first. @@ -1300,13 +1336,13 @@ def log_step( console.print(table) ws, ui = self.weights.topk(len(self.weights)) - table = Table(title=f"Weights >= {constants.MIN_WEIGHT_THRESHOLD}") + table = Table(title=f"Weights >= {constants.WEIGHT_SYNC_MINER_MIN_PERCENT}") table.add_column("uid", justify="right", style="cyan", no_wrap=True) table.add_column("weight", style="magenta") table.add_column("comp", style="magenta") for index, weight in list(zip(ui.tolist(), ws.tolist())): - # All remaining weights should be above the threshold so this check mainly filters out 0s. - if weight >= constants.MIN_WEIGHT_THRESHOLD: + # Show anything with weight high enough to be considered for top model checks. 
+ if weight >= constants.WEIGHT_SYNC_MINER_MIN_PERCENT: table.add_row( str(index), str(round(weight, 4)), str(uid_to_competition_id[index]) ) diff --git a/requirements.txt b/requirements.txt index a612438..081e5b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ -bittensor==6.9.4 +bittensor==8.4.3 huggingface_hub nltk -numpy==1.26.4 +numpy==2.0.2 python-dotenv rich safetensors torch==2.3.1 transformers==4.44.1 wandb==0.18.0 -taoverse==1.1.1 +taoverse==1.3.1 diff --git a/scripts/model_validation.py b/scripts/model_validation.py index 9d35cd9..6ed0506 100644 --- a/scripts/model_validation.py +++ b/scripts/model_validation.py @@ -4,44 +4,25 @@ import argparse import datetime as dt -import math import random import sys +from typing import List +import bittensor as bt +import nltk +from taoverse.metagraph import utils as metagraph_utils from taoverse.model.competition import utils as competition_utils -from taoverse.model.data import Model, ModelId +from taoverse.model.eval.task import EvalTask from taoverse.model.model_updater import ModelUpdater from taoverse.utilities.enum_action import IntEnumAction -from taoverse.utilities.perf_monitor import PerfMonitor -from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig import constants import finetune as ft from competitions.data import CompetitionId +from finetune.datasets.factory import DatasetLoaderFactory +from finetune.datasets.ids import DatasetId from finetune.datasets.subnet.prompting_subset_loader import PromptingSubsetLoader -from finetune.eval.method import compute_multiple_choice_deviation - - -def load_model(model_path, competition_id, allow_remote_code, kwargs) -> Model: - model_id = ModelId( - namespace="namespace", name="name", competition_id=competition_id - ) - if allow_remote_code: - pt_model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=model_path, - trust_remote_code=True, - use_safetensors=True, - **kwargs, - ) - else: - pt_model = 
AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=model_path, - local_files_only=True, - use_safetensors=True, - **kwargs, - ) - - return Model(id=model_id, pt_model=pt_model) +from finetune.eval.sample import EvalSample def main(): @@ -61,12 +42,6 @@ def main(): default=0, help="Random seed to use while loading data. If 0 then randomize.", ) - parser.add_argument( - "--latest_prompting_samples", - type=int, - default=400, - help="Number of most recent prompting samples to eval against", - ) parser.add_argument( "--competition_id", type=CompetitionId, @@ -74,30 +49,9 @@ def main(): action=IntEnumAction, help="competition to mine for (use --list-competitions to get all competitions)", ) - parser.add_argument( - "--allow_remote_code", - action="store_true", - help="If a remote code should be allowed", - ) - parser.add_argument( - "--skip_constraints_check", - action="store_true", - help="If the competition constraints check should be skipped", - ) parser.add_argument( "--list_competitions", action="store_true", help="Print out all competitions" ) - parser.add_argument( - "--tokenizer_override", - action="store_true", - help="If a custom tokenizer should be used rather than the competition one", - ) - parser.add_argument( - "--tokenizer", - type=str, - default="Xenova/gpt-4", - help="Tokenizer", - ) parser.add_argument( "--comp_block", type=int, @@ -106,7 +60,11 @@ def main(): ) args = parser.parse_args() if args.list_competitions: - print(constants.COMPETITION_SCHEDULE_BY_BLOCK) + print( + competition_utils.get_competition_schedule_for_block( + args.comp_block, constants.COMPETITION_SCHEDULE_BY_BLOCK + ) + ) return competition = competition_utils.get_competition_for_block( @@ -115,90 +73,81 @@ def main(): constants.COMPETITION_SCHEDULE_BY_BLOCK, ) + if not competition: + print(f"Competition {args.competition_id} not found.") + return + kwargs = competition.constraints.kwargs.copy() kwargs["use_cache"] = True - print(f"Loading model for competition 
{args.competition_id}") - load_model_perf = PerfMonitor("Eval: Load model") - with load_model_perf.sample(): - model = load_model( - args.model_path, competition.id, args.allow_remote_code, kwargs - ) - print(load_model_perf.summary_str()) + print(f"Loading tokenizer and model from {args.model_path}") + model = ft.mining.load_local_model(args.model_path, kwargs) - if not args.skip_constraints_check: - if not ModelUpdater.verify_model_satisfies_parameters( - model, competition.constraints - ): - print("Model does not satisfy competition parameters!!!") - return + if competition.constraints.tokenizer: + model.tokenizer = ft.model.load_tokenizer(competition.constraints) - pull_data_perf = PerfMonitor("Eval: Pull data") - sample_data = None + if not ModelUpdater.verify_model_satisfies_parameters( + model, competition.constraints + ): + print("Model does not satisfy competition parameters!!!") + return seed = args.random_seed if args.random_seed else random.randint(0, sys.maxsize) - if args.competition_id == CompetitionId.B7_MULTI_CHOICE: - print("Getting latest sample data from prompting.") - with pull_data_perf.sample(): - sample_data = PromptingSubsetLoader( + print("Loading evaluation tasks") + eval_tasks: List[EvalTask] = [] + samples: List[List[EvalSample]] = [] + + # Load data based on the competition. 
+ metagraph = bt.metagraph(constants.PROMPTING_SUBNET_UID) + vali_uids = metagraph_utils.get_high_stake_validators( + metagraph, constants.SAMPLE_VALI_MIN_STAKE + ) + vali_hotkeys = set([metagraph.hotkeys[uid] for uid in vali_uids]) + + for eval_task in competition.eval_tasks: + if eval_task.dataset_id == DatasetId.SYNTHETIC_MMLU: + data_loader = PromptingSubsetLoader( random_seed=seed, - max_samples=args.latest_prompting_samples, - oldest_sample_timestamp=dt.datetime.now() - dt.timedelta(hours=4), + oldest_sample_timestamp=dt.datetime.now(dt.timezone.utc) + - dt.timedelta(hours=6), + validator_hotkeys=vali_hotkeys, ) - else: - print( - f"Competition id: {args.competition_id} has no sample loading logic specified." - ) - return - print(pull_data_perf.summary_str()) - - print("Tokenizing sample data") - if args.tokenizer_override: - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.allow_remote_code - ) - else: - tokenizer = ft.model.load_tokenizer(competition.constraints) - batches = sample_data.tokenize(tokenizer, competition.constraints.sequence_length) - - print("Calculating deviations") - compute_deviation_perf = PerfMonitor("Eval: Compute deviation") - - if args.competition_id == CompetitionId.B7_MULTI_CHOICE: - # Please note, this currently does not include other evaluations that may - # be run as part of the competition. - # These will be included in a future release. 
- generation_config = GenerationConfig( - max_new_tokens=20, - max_length=competition.constraints.sequence_length, - do_sample=False, - repetition_penalty=1.1, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.eos_token_id, - ) - with compute_deviation_perf.sample(): - deviations = compute_multiple_choice_deviation( - model.pt_model, - tokenizer, - generation_config, - batches, - device=args.device, + else: + data_loader = DatasetLoaderFactory.get_loader( + dataset_id=eval_task.dataset_id, + dataset_kwargs=eval_task.dataset_kwargs, + seed=seed, + validator_hotkeys=vali_hotkeys, ) - else: - print( - f"Competition id: {args.competition_id} has no evaluation logic specified." - ) - return - print(compute_deviation_perf.summary_str()) + if data_loader: + eval_tasks.append(eval_task) + print(f"Loaded {len(data_loader)} samples for task {eval_task.name}") + samples.append( + data_loader.tokenize( + model.tokenizer, competition.constraints.sequence_length + ) + ) - average_model_deviation = ( - sum(deviations) / len(deviations) if len(deviations) > 0 else math.inf + print(f"Scoring model on tasks {eval_tasks}") + # Run each computation in a subprocess so that the GPU is reset between each model. + score, score_details = ft.validation.score_model( + model, + eval_tasks, + samples, + competition, + args.device, ) - print(f"The average deviation for {args.model_path} is {average_model_deviation}") + print(f"Computed score: {score}. 
Details: {score_details}") if __name__ == "__main__": + # Make sure we can download the needed ntlk modules + # Used for generating words in word sorting evals + nltk.download("words", raise_on_error=True) + # Used for counting sentences in sentence count evals + nltk.download("punkt", raise_on_error=True) + main() diff --git a/scripts/run_benchmarks.py b/scripts/run_benchmarks.py index 5ef42ea..23debb7 100644 --- a/scripts/run_benchmarks.py +++ b/scripts/run_benchmarks.py @@ -3,6 +3,7 @@ import dataclasses import json import os +import pickle import shutil import time import traceback @@ -12,7 +13,9 @@ import dotenv import lm_eval import wandb +from huggingface_hub import login from lm_eval.models.huggingface import HFLM +from taoverse.model import utils as model_utils from taoverse.model.competition import utils as competition_utils from taoverse.model.competition.data import Competition from taoverse.model.data import ModelMetadata @@ -23,13 +26,9 @@ HuggingFaceModelStore, ) from transformers import AutoTokenizer -from taoverse.model import utils as model_utils -from huggingface_hub import login - -from utils import benchmark_helpers import constants -import pickle +from utils import benchmark_helpers class CompletedEvalStore: @@ -120,7 +119,11 @@ def _run_benchmarks( "leaderboard_bbh", "leaderboard_gpqa", "leaderboard_ifeval", + "leaderboard_musr", "mmlu", + "agieval_en", + "arc_challenge", + "gsm8k_cot", ], verbosity="DEBUG", batch_size="auto", @@ -128,12 +131,12 @@ def _run_benchmarks( ) -def save_state(state: CompletedEvalStore.State, filepath: str): +def save_state(state: Dict[int, CompletedEvalStore.State], filepath: str): with open(filepath, "wb") as f: pickle.dump(state, f) -def load_state(filepath: str) -> CompletedEvalStore.State: +def load_state(filepath: str) -> Dict[int, CompletedEvalStore.State]: with open(filepath, "rb") as f: return pickle.load(f) @@ -197,9 +200,9 @@ def main(args: argparse.Namespace): step = 0 # Load state from previous runs. 
- last_model = None + last_model_per_comp = {} try: - last_model = load_state(args.file) + last_model_per_comp = load_state(args.file) except FileNotFoundError: pass @@ -230,7 +233,7 @@ def main(args: argparse.Namespace): repo=f"{model_metadata.id.namespace}/{model_metadata.id.name}", commit=model_metadata.id.commit, ) - if state == last_model: + if state == last_model_per_comp.get(competition.id, None): print( f"Model {state.repo} at commit {state.commit} has already been benchmarked." ) @@ -264,14 +267,15 @@ def main(args: argparse.Namespace): "uid": uid, "model": model_utils.get_hf_url(model_metadata), "block": model_metadata.block, + "competition_id": competition.id, }, allow_val_change=True, ) wandb_run.log(results | lb_results) wandb_run.finish() - last_model = state - save_state(last_model, args.file) + last_model_per_comp[competition.id] = state + save_state(last_model_per_comp, args.file) if step % 50: print("Deleting HF cache.") diff --git a/tests/finetune/datasets/generated/test_if_eval_loader.py b/tests/finetune/datasets/generated/test_if_eval_loader.py index f28225c..d41b385 100644 --- a/tests/finetune/datasets/generated/test_if_eval_loader.py +++ b/tests/finetune/datasets/generated/test_if_eval_loader.py @@ -8,7 +8,7 @@ class TestIFEvalLoader(unittest.TestCase): def setUp(self): - self.loader = IFEvalLoader(random_seed=42, max_samples=100) + self.loader = IFEvalLoader(random_seed=420, max_samples=100) def test_uniform_distribution_of_rules(self): rule_counts = [len(sample.rules) for sample in self.loader] diff --git a/tests/finetune/test_mining.py b/tests/finetune/test_mining.py index 906aba7..18caf39 100644 --- a/tests/finetune/test_mining.py +++ b/tests/finetune/test_mining.py @@ -8,13 +8,14 @@ import bittensor as bt import torch from taoverse.model.data import Model, ModelId +from transformers import AutoTokenizer import constants import finetune as ft from competitions.data import CompetitionId from tests.model.storage.fake_model_metadata_store 
import FakeModelMetadataStore from tests.model.storage.fake_remote_model_store import FakeRemoteModelStore -from tests.utils import assert_model_equality, get_test_model +from tests.utils import get_test_model class TestMining(unittest.TestCase): @@ -43,18 +44,57 @@ def tearDown(self): def test_model_to_disk_roundtrip(self): """Tests that saving a model to disk and loading it gets the same model.""" + # Use the default model id for local models. + model_id = ModelId( + namespace="local_namespace", + name="local_model", + competition_id=CompetitionId.NONE, + ) + model = Model(id=model_id, pt_model=self.tiny_model, tokenizer=None) + + ft.mining.save(model=model, model_dir=self.model_dir) + retrieved_model = ft.mining.load_local_model( + model_dir=self.model_dir, kwargs={} + ) + + self.assertEqual(str(model), str(retrieved_model)) - ft.mining.save(model=self.tiny_model, model_dir=self.model_dir) - model = ft.mining.load_local_model(model_dir=self.model_dir, kwargs={}) + def test_model_with_tokenizer_to_disk_roundtrip(self): + """Tests that saving a model with tokenizer to disk and loading it gets the same model.""" + # Use the default model id for local models. + model_id = ModelId( + namespace="local_namespace", + name="local_model", + competition_id=CompetitionId.NONE, + ) + tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4") + model = Model(id=model_id, pt_model=self.tiny_model, tokenizer=tokenizer) + + ft.mining.save(model=model, model_dir=self.model_dir) + retrieved_model = ft.mining.load_local_model( + model_dir=self.model_dir, kwargs={} + ) - assert_model_equality(self, self.tiny_model, model) + # Overwrite the name of the tokenizer to avoid it using the local path. 
+ retrieved_model.tokenizer.name_or_path = "Xenova/gpt-4" + self.assertEqual(str(model), str(retrieved_model)) def _test_push( - self, min_expected_block: int = 1, competition_id=CompetitionId.B7_MULTI_CHOICE + self, + min_expected_block: int = 1, + competition_id=CompetitionId.B7_MULTI_CHOICE, + tokenizer=None, ): + model_id = ModelId( + namespace="namespace", + name="name", + competition_id=competition_id, + ) + model = Model(id=model_id, pt_model=self.tiny_model, tokenizer=tokenizer) + asyncio.run( ft.mining.push( - model=self.tiny_model, + model=model, wallet=self.wallet, competition_id=competition_id, repo="namespace/name", @@ -66,8 +106,10 @@ def _test_push( ) # Check that the model was uploaded to hugging face. - model: Model = self.remote_store.get_only_model() - assert_model_equality(self, self.tiny_model, model.pt_model) + retrieved_model: Model = self.remote_store.get_only_model() + # Align the model id with the retrieved model as the hash and such will change. + model.id = retrieved_model.id + self.assertEqual(str(model), str(retrieved_model)) # Check that the model ID was published on the chain. model_metadata = asyncio.run( @@ -76,10 +118,12 @@ def _test_push( self.assertGreaterEqual(model_metadata.block, min_expected_block) # Check certain properties of the model metadata. 
- self.assertEqual(model_metadata.id.commit, model.id.commit) - self.assertEqual(model_metadata.id.name, model.id.name) - self.assertEqual(model_metadata.id.namespace, model.id.namespace) - self.assertEqual(model_metadata.id.competition_id, model.id.competition_id) + self.assertEqual(model_metadata.id.commit, retrieved_model.id.commit) + self.assertEqual(model_metadata.id.name, retrieved_model.id.name) + self.assertEqual(model_metadata.id.namespace, retrieved_model.id.namespace) + self.assertEqual( + model_metadata.id.competition_id, retrieved_model.id.competition_id + ) self.metadata_store.reset() self.remote_store.reset() @@ -88,6 +132,11 @@ def test_push_success(self): """Tests that pushing a model to the chain is successful.""" self._test_push() + def test_push_success_tokenizer(self): + """Tests that pushing a model with a tokenizer to the chain is successful.""" + tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4") + self._test_push(tokenizer=tokenizer) + def test_push_model_chain_failure(self): """Tests that pushing a model is eventually successful even if pushes to the chain fail.""" @@ -166,15 +215,13 @@ async def test_load_best_model(self): # Upload the model for miner 1. model_store = FakeRemoteModelStore() - model = self._get_model() + model = self.tiny_model await model_store.upload_model( Model( id=miner_1_model_id, pt_model=model, ), - model_constraints=constants.MODEL_CONSTRAINTS_BY_COMPETITION_ID.get( - 1, None - ), + competition=CompetitionId.SN9_MODEL, ) # Verify that miner 1's model is loaded. 
diff --git a/tests/utils.py b/tests/utils.py index 4563291..75ec0f3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,14 +3,6 @@ from transformers import LlamaConfig, LlamaForCausalLM, PreTrainedModel -def assert_model_equality( - test_case: unittest.TestCase, model1: PreTrainedModel, model2: PreTrainedModel -): - """Checks if two models are equal.""" - test_case.assertEqual(type(model1), type(model2)) - test_case.assertEqual(str(model1.state_dict()), str(model2.state_dict())) - - def get_test_model() -> PreTrainedModel: """Gets a test model that is small enough to load and store quickly. diff --git a/utils/benchmark_helpers.py b/utils/benchmark_helpers.py index a8f91be..502c896 100644 --- a/utils/benchmark_helpers.py +++ b/utils/benchmark_helpers.py @@ -296,6 +296,6 @@ def get_leaderboard_scores(results: Dict[str, Any]) -> Dict[str, float]: "bbh": compute_bbh(results), # "math": compute_math(results), "gpqa": compute_gpqa(results), - # "musr": compute_musr(results), + "musr": compute_musr(results), "mmlu_pro": compute_mmlu_pro(results), }