Nit improvements in model loading

cbalioglu · cbalioglu · commit 343be99dd726 · 2025-02-23T00:04:42.000Z
diff --git a/src/fairseq2/assets/_download_manager.py b/src/fairseq2/assets/_download_manager.py
@@ -37,7 +37,7 @@ def download_checkpoint(
         uri: str,
         model_name: str,
         *,
-        shard_idx: int | None = None,
+        shard_idx: int,
         force: bool = False,
         progress: bool = True,
     ) -> Path:
@@ -62,18 +62,15 @@ def download_checkpoint(
     def download_tokenizer(
         self,
         uri: str,
-        model_name: str,
+        tokenizer_name: str,
         *,
-        tokenizer_name: str | None = None,
         force: bool = False,
         progress: bool = True,
     ) -> Path:
         """Download the tokenizer at ``uri`` to the asset cache directory.
 
         :param uri:
             The URI to download from.
-        :param model_name:
-            The name of the associated model.
         :param tokenizer_name:
             The name of the tokenizer.
         :param force:
@@ -129,15 +126,12 @@ def download_checkpoint(
         uri: str,
         model_name: str,
         *,
-        shard_idx: int | None = None,
+        shard_idx: int = 0,
         force: bool = False,
         progress: bool = True,
     ) -> Path:
         display_name = f"checkpoint of {model_name}"
 
-        if shard_idx is not None:
-            display_name = f"{display_name} (shard {shard_idx})"
-
         op = _AssetDownloadOp(
             self._cache_dir, uri, display_name, force, progress, shard_idx
         )
@@ -148,16 +142,12 @@ def download_checkpoint(
     def download_tokenizer(
         self,
         uri: str,
-        model_name: str,
+        tokenizer_name: str,
         *,
-        tokenizer_name: str | None = None,
         force: bool = False,
         progress: bool = True,
     ) -> Path:
-        if not tokenizer_name:
-            display_name = f"tokenizer of {model_name}"
-        else:
-            display_name = f"{tokenizer_name} tokenizer of {model_name}"
+        display_name = f"{tokenizer_name} tokenizer"
 
         op = _AssetDownloadOp(self._cache_dir, uri, display_name, force, progress)
 
@@ -187,7 +177,7 @@ class _AssetDownloadOp:
     _display_name: str
     _force: bool
     _progress: bool
-    _shard_idx: int | None
+    _shard_idx: int
 
     def __init__(
         self,
@@ -196,7 +186,7 @@ def __init__(
         display_name: str,
         force: bool,
         progress: bool,
-        shard_idx: int | None = None,
+        shard_idx: int = 0,
     ) -> None:
         self._cache_dir = cache_dir
         self._uri = uri
@@ -266,14 +256,12 @@ def _process_uri(self) -> None:
         self._uri = parsed_uri._replace(params="").geturl()
 
     def _format_uri_with_shard_index(self) -> None:
-        if self._shard_idx is None:
-            return
-
         sharded_uri = self._uri.replace("%7Bshard_idx%7D", str(self._shard_idx))
-        if sharded_uri == self._uri:
-            raise AssetDownloadError(
-                f"`shard_idx` is specified, but the {self._display_name} is not sharded."
-            )
+        if self._shard_idx > 1:
+            if sharded_uri == self._uri:
+                raise AssetDownloadError(
+                    f"`shard_idx` is specified, but the {self._display_name} is not sharded."
+                )
 
         self._uri = sharded_uri
 
diff --git a/src/fairseq2/models/_handler.py b/src/fairseq2/models/_handler.py
@@ -12,7 +12,6 @@
 
 import torch
 from torch.nn import Module
-from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
 from typing_extensions import override
 
 from fairseq2.assets import (
@@ -264,51 +263,11 @@ def load_config(self, card: AssetCard) -> object:
     def create(
         self, config: object, gangs: Gangs, dtype: DataType, meta: bool
     ) -> Module:
-        if meta:
-            if not self.supports_meta:
-                raise NotSupportedError(
-                    f"The '{self._family}' model family does not support meta device initialization."
-                )
-
-            device = META
-        elif gangs.root.size != gangs.dp.size:
-            device = CPU  # Avoid OOM for sharded models.
-        else:
-            device = gangs.root.device
-
         config = structure(config, self._configs.config_kls)
 
         validate(config)
 
-        original_dtype = torch.get_default_dtype()
-
-        try:
-            torch.set_default_dtype(dtype)
-
-            with device:
-                model = self._factory(config)
-        except NotImplementedError as ex:
-            if "'Meta' backend" not in str(ex):
-                raise
-
-            raise ContractError(
-                "One or more operators in the model constructor have failed to initialize on the meta device. See the nested exception for details."
-            ) from ex
-        finally:
-            torch.set_default_dtype(original_dtype)
-
-        if gangs.root.size != gangs.dp.size:
-            if self._sharder is None:
-                raise NotSupportedError(
-                    f"The '{self._family}' model family does not support non-data parallelism."
-                )
-
-            self._sharder(model, config, gangs)
-
-            if not meta and device != gangs.root.device:
-                to_device(model, gangs.root.device)
-
-        return model
+        return self._do_create(config, gangs, dtype, meta)
 
     @override
     def load(
@@ -336,10 +295,8 @@ def load(
         except AssetCardError as ex:
             raise model_asset_card_error(model_name) from ex
 
-        shard_idx = gangs.tp.rank if num_shards > 1 else None
-
         path = self._asset_download_manager.download_checkpoint(
-            checkpoint_uri, model_name, shard_idx=shard_idx
+            checkpoint_uri, model_name, shard_idx=gangs.tp.rank
         )
 
         # Load the configuration.
@@ -394,6 +351,10 @@ def load_from_path(
                 "`gangs` must be on a real device, but is on the meta device instead."
             )
 
+        config = structure(config, self._configs.config_kls)
+
+        validate(config)
+
         if restrict is None:
             restrict = self._restrict
 
@@ -421,7 +382,7 @@ def load_from_path(
                 ) from ex
 
         # Create the model.
-        model = self.create(config, gangs, dtype, meta=self.supports_meta)
+        model = self._do_create(config, gangs, dtype, meta=self.supports_meta)
 
         if self.supports_meta:
             # Move the model to the actual device without initializing. Its
@@ -448,9 +409,6 @@ def load_from_path(
                 model_name, f"The model state dictionary in the '{model_name}' checkpoint is expected to be of type `dict`, but is of type `{type(state_dict)}` instead."  # fmt: skip
             )
 
-        # Remove DDP 'module' prefix.
-        consume_prefix_in_state_dict_if_present(state_dict, prefix="module.")
-
         try:
             load_state_dict(model, state_dict)
         except (KeyError, ValueError) as ex:
@@ -465,6 +423,51 @@ def load_from_path(
 
         return model
 
+    def _do_create(
+        self, config: object, gangs: Gangs, dtype: DataType, meta: bool
+    ) -> Module:
+        if meta:
+            if not self.supports_meta:
+                raise NotSupportedError(
+                    f"The '{self._family}' model family does not support meta device initialization."
+                )
+
+            device = META
+        elif gangs.root.size != gangs.dp.size:
+            device = CPU  # Avoid OOM for sharded models.
+        else:
+            device = gangs.root.device
+
+        original_dtype = torch.get_default_dtype()
+
+        try:
+            torch.set_default_dtype(dtype)
+
+            with device:
+                model = self._factory(config)
+        except NotImplementedError as ex:
+            if "'Meta' backend" not in str(ex):
+                raise
+
+            raise ContractError(
+                "One or more operators in the model constructor have failed to initialize on the meta device. See the nested exception for details."
+            ) from ex
+        finally:
+            torch.set_default_dtype(original_dtype)
+
+        if gangs.root.size != gangs.dp.size:
+            if self._sharder is None:
+                raise NotSupportedError(
+                    f"The '{self._family}' model family does not support non-data parallelism."
+                )
+
+            self._sharder(model, config, gangs)
+
+            if not meta and device != gangs.root.device:
+                to_device(model, gangs.root.device)
+
+        return model
+
     def compile(self, model: Module, config: object) -> Module:
         if self._torch_compiler is None:
             raise NotSupportedError(
diff --git a/src/fairseq2/recipes/common/_distributed.py b/src/fairseq2/recipes/common/_distributed.py
@@ -13,6 +13,7 @@
 from torch import Tensor
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.nn import Module
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
 from typing_extensions import override
@@ -113,7 +114,11 @@ def clip_gradient_norm(self, max_norm: float | None) -> Tensor:
 
     @override
     def state_dict(self) -> dict[str, object]:
-        return self._ddp.state_dict()
+        state_dict = self._ddp.state_dict()
+
+        consume_prefix_in_state_dict_if_present(state_dict, prefix="module.")
+
+        return state_dict
 
     @override
     def optim_state_dict(self, optim: Optimizer) -> dict[str, object]:
diff --git a/src/fairseq2/utils/structured.py b/src/fairseq2/utils/structured.py
@@ -169,9 +169,7 @@ def _structure_dataclass(
             )
 
         if isinstance(obj, kls):
-            values = {f.name: getattr(obj, f.name) for f in fields(kls)}
-
-            return self._create_dataclass(kls, values, set_empty)
+            return obj
 
         if isinstance(obj, Mapping):
             values = self.structure(obj, dict[str, object])

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -169,9 +169,7 @@ def _structure_dataclass(` |
| `169` | `169` | `)` |
| `170` | `170` | |
| `171` | `171` | `if isinstance(obj, kls):` |
| `172` | | `- values = {f.name: getattr(obj, f.name) for f in fields(kls)}` |
| `173` | | `-` |
| `174` | | `- return self._create_dataclass(kls, values, set_empty)` |
| | `172` | `+ return obj` |
| `175` | `173` | |
| `176` | `174` | `if isinstance(obj, Mapping):` |
| `177` | `175` | `values = self.structure(obj, dict[str, object])` |