pstjohn committed
Commit 76ff34a · verified · 1 Parent(s): 2fcafda

Upload folder using huggingface_hub

Files changed (5)
  1. config.json +3 -2
  2. esm_nv.py +145 -74
  3. special_tokens_map.json +42 -5
  4. tokenizer.json +176 -0
  5. tokenizer_config.json +8 -1
config.json CHANGED
@@ -8,7 +8,8 @@
   "auto_map": {
     "AutoConfig": "esm_nv.NVEsmConfig",
     "AutoModel": "esm_nv.NVEsmModel",
-    "AutoModelForMaskedLM": "esm_nv.NVEsmForMaskedLM"
+    "AutoModelForMaskedLM": "esm_nv.NVEsmForMaskedLM",
+    "AutoModelForTokenClassification": "esm_nv.NVEsmForTokenClassification"
   },
   "classifier_dropout": null,
   "dtype": "float32",
@@ -35,7 +36,7 @@
   "position_embedding_type": "rotary",
   "qkv_weight_interleaved": true,
   "token_dropout": true,
-  "transformers_version": "4.57.0",
+  "transformers_version": "4.57.3",
   "use_cache": true,
   "vocab_list": null,
   "vocab_size": 33
esm_nv.py CHANGED
@@ -23,7 +23,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """
 
-from typing import Literal, Optional
+from typing import Literal, Optional, Unpack
 
 # TODO: put import guard around transformer_engine here, with an informative error message around
 # installation and the nvidia docker container.
@@ -36,15 +36,26 @@ from transformers.modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
     MaskedLMOutput,
+    TokenClassifierOutput,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.esm.configuration_esm import EsmConfig
 from transformers.models.esm.modeling_esm import EsmPooler
 from transformers.utils import logging
+from transformers.utils.generic import TransformersKwargs
 
 
 logger = logging.get_logger(__name__)
 
+# Dictionary that gets inserted into config.json to map Auto** classes to our TE-optimized model classes defined below.
+# These should be prefixed with esm_nv., since we name the file esm_nv.py in our exported checkpoints.
+AUTO_MAP = {
+    "AutoConfig": "esm_nv.NVEsmConfig",
+    "AutoModel": "esm_nv.NVEsmModel",
+    "AutoModelForMaskedLM": "esm_nv.NVEsmForMaskedLM",
+    "AutoModelForTokenClassification": "esm_nv.NVEsmForTokenClassification",
+}
+
 
 class NVEsmConfig(EsmConfig):
     """NVEsmConfig is a configuration for the NVEsm model."""
@@ -149,7 +160,9 @@ class NVEsmEncoder(nn.Module):
                 for i in range(config.num_hidden_layers)
             ]
         )
-        self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps, params_dtype=config.dtype
+        )
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
 
@@ -157,27 +170,28 @@
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
-        output_hidden_states: bool = False,
-        cu_seq_lens_q: torch.IntTensor | None = None,
-        cu_seq_lens_k: torch.IntTensor | None = None,
-        max_length_q: int | None = None,
-        max_length_k: int | None = None,
+        **kwargs: Unpack[TransformersKwargs],
     ):
         """Forward pass of the NVEsmEncoder.
 
         Args:
             hidden_states (torch.Tensor): The hidden states.
             attention_mask (torch.Tensor): The attention mask.
-            output_hidden_states (bool): Whether to output the hidden states.
-            cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
-            cu_seq_lens_k (torch.IntTensor): The cumulative sequence lengths for the key state, if using THD inputs.
-            max_length_q (int): The maximum length for the query state, if using THD inputs.
-            max_length_k (int): The maximum length for the key state, if using THD inputs.
+            **kwargs: Additional arguments, see TransformersKwargs for more details.
         """
         all_hidden_states: tuple[torch.Tensor, ...] = ()
+        has_thd_input = [
+            x is not None
+            for x in [
+                kwargs.get("cu_seq_lens_q", None),
+                kwargs.get("cu_seq_lens_k", None),
+                kwargs.get("max_length_q", None),
+                kwargs.get("max_length_k", None),
+            ]
+        ]
 
         if self.config.attn_input_format == "thd":
-            if any(x is None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
+            if not all(has_thd_input):
                 raise ValueError(
                     "cu_seq_lens_q, cu_seq_lens_k, max_length_q, and max_length_k must be provided when using THD inputs."
                 )
@@ -187,11 +201,10 @@ class NVEsmEncoder(nn.Module):
             hidden_states = hidden_states.squeeze(0)
             attention_mask = None
 
-        elif self.config.attn_input_format == "bshd":
-            if any(x is not None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
-                raise ValueError(
-                    "cu_seq_lens_q, cu_seq_lens_k, max_length_q, and max_length_k are not allowed when using BSHD inputs."
-                )
+        elif self.config.attn_input_format == "bshd" and any(has_thd_input):
+            raise ValueError(
+                "cu_seq_lens_q, cu_seq_lens_k, max_length_q, and max_length_k are not allowed when using BSHD inputs."
+            )
 
         # Ensure that rotary embeddings are computed with at a higher precision outside the torch autocast context.
         with torch.autocast(device_type="cuda", enabled=False):
@@ -199,26 +212,33 @@ class NVEsmEncoder(nn.Module):
             if self.config.attn_input_format == "bshd":
                 te_rope_emb = self.rotary_embeddings(max_seq_len=hidden_states.shape[1])
             elif self.config.attn_input_format == "thd":
-                te_rope_emb = self.rotary_embeddings(max_seq_len=cu_seq_lens_q[-1])
-            te_rope_emb = te_rope_emb.to(hidden_states.device, dtype=hidden_states.dtype, non_blocking=True)
+                te_rope_emb = self.rotary_embeddings(
+                    max_seq_len=kwargs["cu_seq_lens_q_padded"][-1]
+                    if "cu_seq_lens_q_padded" in kwargs
+                    else kwargs["cu_seq_lens_q"][-1]
+                )
+            te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
 
         for layer_module in self.layers:
-            if output_hidden_states:
+            if kwargs.get("output_hidden_states", False):
                 all_hidden_states = (*all_hidden_states, hidden_states)
 
             hidden_states = layer_module(
                 hidden_states,
                 attention_mask,
                 rotary_pos_emb=te_rope_emb,
-                cu_seqlens_q=cu_seq_lens_q,
-                cu_seqlens_kv=cu_seq_lens_k,
-                max_seqlen_q=max_length_q,
-                max_seqlen_kv=max_length_k,
+                cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
+                cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
+                cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
+                cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
+                max_seqlen_q=kwargs.get("max_length_q", None),
+                max_seqlen_kv=kwargs.get("max_length_k", None),
+                pad_between_seqs=kwargs.get("pad_between_seqs", None),
            )
 
         hidden_states = self.emb_layer_norm_after(hidden_states)
 
-        if output_hidden_states:
+        if kwargs.get("output_hidden_states", False):
             all_hidden_states = (*all_hidden_states, hidden_states)
 
         return BaseModelOutput(
@@ -233,6 +253,7 @@ class NVEsmPreTrainedModel(PreTrainedModel):
     config_class = NVEsmConfig
     base_model_prefix = "esm"
     supports_gradient_checkpointing = False
+    accepts_loss_kwargs = False
     _no_split_modules = (
         "TransformerLayer",
         "EsmEmbeddings",
@@ -265,6 +286,11 @@ class NVEsmPreTrainedModel(PreTrainedModel):
             if module.layer_norm_bias is not None:
                 module.layer_norm_bias.data.zero_()
 
+    @classmethod
+    def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
+        """Override the default get_init_context method to allow for fp8 model initialization."""
+        return []
+
 
 class NVEsmModel(NVEsmPreTrainedModel):
     """The ESM Encoder-only protein language model.
@@ -310,11 +336,7 @@ class NVEsmModel(NVEsmPreTrainedModel):
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
-        output_hidden_states: Optional[bool] = None,
-        cu_seq_lens_q: torch.IntTensor | None = None,
-        cu_seq_lens_k: torch.IntTensor | None = None,
-        max_length_q: int | None = None,
-        max_length_k: int | None = None,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPooling:
         """Forward pass of the NVEsmModel.
 
@@ -323,19 +345,11 @@ class NVEsmModel(NVEsmPreTrainedModel):
             attention_mask (torch.Tensor): The attention mask.
             position_ids (torch.Tensor): The position ids.
             inputs_embeds (torch.Tensor): The input embeddings.
-            output_hidden_states (bool): Whether to output the hidden states.
-            cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
-            cu_seq_lens_k (torch.IntTensor): The cumulative sequence lengths for the key state, if using THD inputs.
-            max_length_q (int): The maximum length for the query state, if using THD inputs.
-            max_length_k (int): The maximum length for the key state, if using THD inputs.
+            **kwargs: Additional arguments, see TransformersKwargs for more details.
 
         Returns:
             BaseModelOutputWithPooling: The output of the model.
         """
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -363,19 +377,12 @@ class NVEsmModel(NVEsmPreTrainedModel):
             input_ids=input_ids,
             attention_mask=attention_mask,
             inputs_embeds=inputs_embeds,
-            cu_seq_lens_q=cu_seq_lens_q,
-            cu_seq_lens_k=cu_seq_lens_k,
-            max_length_q=max_length_q,
-            max_length_k=max_length_k,
+            **kwargs,
         )
         encoder_outputs = self.encoder(
             embedding_output,
             attention_mask=extended_attention_mask,
-            output_hidden_states=output_hidden_states,
-            cu_seq_lens_q=cu_seq_lens_q,
-            cu_seq_lens_k=cu_seq_lens_k,
-            max_length_q=max_length_q,
-            max_length_k=max_length_k,
+            **kwargs,
        )
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
@@ -427,11 +434,7 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
         position_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
-        output_hidden_states: Optional[bool] = None,
-        cu_seq_lens_q: torch.IntTensor | None = None,
-        cu_seq_lens_k: torch.IntTensor | None = None,
-        max_length_q: int | None = None,
-        max_length_k: int | None = None,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> MaskedLMOutput:
         """Forward pass of the NVEsmForMaskedLM.
 
@@ -441,11 +444,7 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
             position_ids (torch.LongTensor): The position ids.
             inputs_embeds (torch.FloatTensor): The input embeddings.
             labels (torch.LongTensor): The labels.
-            output_hidden_states (bool): Whether to output the hidden states.
-            cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
-            cu_seq_lens_k (torch.IntTensor): The cumulative sequence lengths for the key state, if using THD inputs.
-            max_length_q (int): The maximum length for the query state, if using THD inputs.
-            max_length_k (int): The maximum length for the key state, if using THD inputs.
+            **kwargs: Additional arguments, see TransformersKwargs for more details.
 
         Returns:
             MaskedLMOutput: The output of the model.
@@ -455,11 +454,7 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
             attention_mask=attention_mask,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
-            output_hidden_states=output_hidden_states,
-            cu_seq_lens_q=cu_seq_lens_q,
-            cu_seq_lens_k=cu_seq_lens_k,
-            max_length_q=max_length_q,
-            max_length_k=max_length_k,
+            **kwargs,
        )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
@@ -493,13 +488,18 @@ class NVEsmLMHead(nn.Module):
             config (NVEsmConfig): The configuration of the model.
         """
         super().__init__()
-        self.dense = transformer_engine.pytorch.Linear(config.hidden_size, config.hidden_size)
+        self.dense = transformer_engine.pytorch.Linear(
+            config.hidden_size,
+            config.hidden_size,
+            params_dtype=config.dtype,
+        )
 
         self.decoder = transformer_engine.pytorch.LayerNormLinear(
             config.hidden_size,
             config.padded_vocab_size if config.padded_vocab_size is not None else config.vocab_size,
             bias=True,
             eps=config.layer_norm_eps,
+            params_dtype=config.dtype,
         )
 
     def forward(self, features, **kwargs):
@@ -522,11 +522,16 @@
         """Initialize a NVEsmEmbeddings."""
         super().__init__()
         self.word_embeddings = nn.Embedding(
-            config.padded_vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+            config.padded_vocab_size,
+            config.hidden_size,
+            padding_idx=config.pad_token_id,
+            dtype=config.dtype,
         )
 
         self.layer_norm = (
-            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.emb_layer_norm_before else None
+            transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+            if config.emb_layer_norm_before
+            else None
         )
 
         if config.position_embedding_type != "rotary":
@@ -544,10 +549,7 @@
         input_ids=None,
         attention_mask=None,
         inputs_embeds=None,
-        cu_seq_lens_q: torch.IntTensor | None = None,
-        cu_seq_lens_k: torch.IntTensor | None = None,
-        max_length_q: int | None = None,
-        max_length_k: int | None = None,
+        **kwargs: Unpack[TransformersKwargs],
     ):
         """Forward pass of the NVEsmEmbeddings."""
         if inputs_embeds is None:
@@ -557,7 +559,12 @@
         # embedding_scale factor here.
         embeddings = inputs_embeds
 
-        if all(x is not None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
+        if (
+            kwargs.get("cu_seq_lens_q") is not None
+            and kwargs.get("cu_seq_lens_k") is not None
+            and kwargs.get("max_length_q") is not None
+            and kwargs.get("max_length_k") is not None
+        ):
             using_thd = True
             attention_mask = None
         else:
@@ -583,10 +590,12 @@
                 embeddings = (embeddings * scale_factor[:, None, None]).to(embeddings.dtype)
 
             else:
-                src_lengths = torch.diff(cu_seq_lens_q)
+                src_lengths = torch.diff(kwargs["cu_seq_lens_q"])
                 # We need to find the number of masked tokens in each sequence in the padded batch.
                 is_masked = (input_ids == self.mask_token_id).squeeze(0)
-                n_masked_per_seq = torch.nested.nested_tensor_from_jagged(is_masked, offsets=cu_seq_lens_q).sum(1)
+                n_masked_per_seq = torch.nested.nested_tensor_from_jagged(
+                    is_masked, offsets=kwargs["cu_seq_lens_q"]
+                ).sum(1)
                 mask_ratio_observed = n_masked_per_seq.float() / src_lengths
                 scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
                 reshaped_scale_factor = torch.repeat_interleave(scale_factor, src_lengths, dim=0)
@@ -599,3 +608,65 @@
             embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
 
         return embeddings
+
+
+class NVEsmForTokenClassification(NVEsmPreTrainedModel):
+    """Adds a token classification head to the model.
+
+    Adapted from EsmForTokenClassification in Hugging Face Transformers `modeling_esm.py`.
+    """
+
+    def __init__(self, config):
+        """Initialize NVEsmForTokenClassification."""
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.esm = NVEsmModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = transformer_engine.pytorch.Linear(
+            config.hidden_size, config.num_labels, params_dtype=config.dtype
+        )
+
+        self.init_weights()
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> TokenClassifierOutput:
+        """Forward pass for the token classification head.
+
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
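A minimal usage sketch for the new NVEsmForTokenClassification head, assuming a checkpoint exported with this esm_nv.py; the path and per-token labels below are placeholders:

    import torch
    from transformers import AutoModelForTokenClassification, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/checkpoint")  # placeholder path
    model = AutoModelForTokenClassification.from_pretrained(
        "path/to/checkpoint", trust_remote_code=True, num_labels=2
    )

    batch = tokenizer("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", return_tensors="pt")
    labels = torch.zeros_like(batch["input_ids"])  # dummy per-token labels, shape (1, seq_len)

    out = model(**batch, labels=labels)
    print(out.loss)          # scalar cross-entropy loss over all tokens
    print(out.logits.shape)  # (1, seq_len, num_labels)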
special_tokens_map.json CHANGED
@@ -1,7 +1,44 @@
 {
-  "cls_token": "<cls>",
-  "eos_token": "<eos>",
-  "mask_token": "<mask>",
-  "pad_token": "<pad>",
-  "unk_token": "<unk>"
+  "bos_token": {
+    "content": "<cls>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<cls>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
tokenizer.json ADDED
@@ -0,0 +1,176 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<cls>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 32,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Split",
+    "pattern": {
+      "String": ""
+    },
+    "behavior": "Isolated",
+    "invert": false
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<cls>": {
+        "id": "<cls>",
+        "ids": [
+          0
+        ],
+        "tokens": [
+          "<cls>"
+        ]
+      },
+      "<eos>": {
+        "id": "<eos>",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "<eos>"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "<cls>": 0,
+      "<pad>": 1,
+      "<eos>": 2,
+      "<unk>": 3,
+      "L": 4,
+      "A": 5,
+      "G": 6,
+      "V": 7,
+      "S": 8,
+      "E": 9,
+      "R": 10,
+      "T": 11,
+      "I": 12,
+      "D": 13,
+      "P": 14,
+      "K": 15,
+      "Q": 16,
+      "N": 17,
+      "F": 18,
+      "Y": 19,
+      "M": 20,
+      "H": 21,
+      "W": 22,
+      "C": 23,
+      "X": 24,
+      "B": 25,
+      "U": 26,
+      "Z": 27,
+      "O": 28,
+      ".": 29,
+      "-": 30,
+      "<null_1>": 31,
+      "<mask>": 32
+    },
+    "unk_token": "<unk>"
+  }
+}
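The added tokenizer.json defines a character-level WordLevel vocabulary with a TemplateProcessing post-processor that wraps each sequence in <cls> ... <eos>. A small sketch with the tokenizers library, run from the checkpoint folder:

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")
    enc = tok.encode("MKTA")
    print(enc.tokens)  # expected: ['<cls>', 'M', 'K', 'T', 'A', '<eos>']
    print(enc.ids)     # expected: [0, 20, 15, 11, 5, 2], per the vocab above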
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
 {
+  "add_bos_token": true,
+  "add_eos_token": true,
   "added_tokens_decoder": {
     "0": {
       "content": "<cls>",
@@ -41,13 +43,18 @@
       "special": true
     }
   },
+  "bos_token": "<cls>",
   "clean_up_tokenization_spaces": false,
   "cls_token": "<cls>",
   "eos_token": "<eos>",
   "extra_special_tokens": {},
   "mask_token": "<mask>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
-  "tokenizer_class": "EsmTokenizer",
+  "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "<unk>"
 }
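With tokenizer_class switched to PreTrainedTokenizerFast, AutoTokenizer now loads the fast tokenizer defined in tokenizer.json directly. A brief sketch with a placeholder path; the outputs are what the updated config should produce:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/checkpoint")  # placeholder path
    print(type(tok).__name__)        # expected: PreTrainedTokenizerFast
    print(tok.model_input_names)     # expected: ['input_ids', 'attention_mask']
    print(tok("MKTA")["input_ids"])  # expected: [0, 20, 15, 11, 5, 2]; <cls>/<eos> added by the post-processor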