jonsaadfalcon committed
Commit 1a649e0 · verified · 1 Parent(s): 348de8e

Update README.md

Files changed (1):
  1. README.md +47 -113

README.md CHANGED
@@ -1,119 +1,53 @@
  ```
- import torch
- import torch.nn as nn
- from transformers import AutoTokenizer, AutoModel
- from huggingface_hub import hf_hub_download

- # Define the MLPHead class (same as in training)
- class MLPHead(nn.Module):
-     def __init__(self, input_dim: int, hidden_dims: list, dropout_rate: float = 0.1):
-         super().__init__()
-         layers = []
-         prev_dim = input_dim
-
-         for hidden_dim in hidden_dims:
-             layers.extend([
-                 nn.Linear(prev_dim, hidden_dim),
-                 nn.GELU(),
-                 nn.Dropout(dropout_rate)
-             ])
-             prev_dim = hidden_dim
-
-         layers.append(nn.Linear(prev_dim, 1))
-         self.mlp = nn.Sequential(*layers)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         return self.mlp(x)

- # Define the CustomCrossEncoder class
- class CustomCrossEncoder(nn.Module):
-     def __init__(self, model_name="Alibaba-NLP/gte-Qwen2-1.5B-instruct"):
-         super().__init__()
-
-         # Load base model and tokenizer
-         self.base_model = AutoModel.from_pretrained(
-             model_name,
-             trust_remote_code=True,
-             torch_dtype=torch.bfloat16,
-             device_map="auto"
-         )
-         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-         if self.tokenizer.pad_token is None:
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-
-         # Initialize MLP head with same architecture as training
-         self.embedding_dim = 1536  # For Qwen2-1.5B
-         self.mlp_head = MLPHead(
-             input_dim=self.embedding_dim,
-             hidden_dims=[1536, 768, 384],  # Same as training
-             dropout_rate=0.1
-         ).to(torch.bfloat16)
-
-     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-         outputs = self.base_model(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             output_hidden_states=True
-         )
-
-         last_hidden_state = outputs.hidden_states[-1]
-         cls_embedding = last_hidden_state[:, 0, :]
-
-         output = self.mlp_head(cls_embedding)
-         return output.squeeze(-1)

- # Load the model
- def load_cross_encoder(repo_id="hazyresearch/Weaver_Distilled_General_gte-Qwen2-1.5B-instruct"):
-     # Initialize model
-     model = CustomCrossEncoder()
-
-     # Download checkpoint from HF hub
-     checkpoint_path = hf_hub_download(
-         repo_id=repo_id,
-         filename="pytorch_model.pt",
-         use_auth_token=True  # Will use your HF token
-     )
-
-     # Load state dict
-     state_dict = torch.load(checkpoint_path, map_location="cuda")
-     model.load_state_dict(state_dict)
-     model.eval()
-
-     return model

- # Example usage
- def score_text_pair(model, text1, text2, max_length=8192):
-     """Score a pair of texts using the cross-encoder."""
-     # Tokenize
-     encoded = model.tokenizer(
-         text=text1,
-         text_pair=text2,
-         truncation=True,
-         max_length=max_length,
-         padding="max_length",
-         return_tensors="pt"
-     )
-
-     # Move to device
-     input_ids = encoded["input_ids"].cuda()
-     attention_mask = encoded["attention_mask"].cuda()
-
-     # Get score
-     with torch.no_grad():
-         score = model(input_ids, attention_mask)
-
-     return score.item()

- # Usage example
- if __name__ == "__main__":
-     # Load model
-     model = load_cross_encoder()
-
-     # Example: Score an instruction-response pair
-     instruction = "What is the capital of France?"
-     response = "The capital of France is Paris."
-
-     score = score_text_pair(model, instruction, response)
-     print(f"Score: {score:.4f}")
- ```
 
+ # Weaver Distilled - All Datasets (gte-Qwen2-1.5B-instruct)
+
+ This is a distilled cross-encoder model based on gte-Qwen2-1.5B-instruct, trained to predict the correctness of answers across multiple domains. This general-purpose verifier was trained on a combined dataset built from 35 different verifiers and reward models, aggregated using Weaver.
+
+ ## Model Details
+
+ - **Base Model**: [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)
+ - **Architecture**: Cross-encoder with an MLP head (1536 → 768 → 384 → 1)
+ - **Max Sequence Length**: 4096
+ - **Training Data**: Combined dataset from 35 different LM judges and reward models, aggregated with Weaver
+ - **Training Objective**: Binary classification (correct/incorrect answer prediction)
+
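+ As a concrete reference for the head dimensions listed above, here is a minimal sketch of the scoring head, mirroring the MLPHead definition from the previous revision of this README (GELU activations, dropout 0.1, final scalar output); the helper name is illustrative:
+
+ ```python
+ import torch.nn as nn
+
+ # Sketch of the MLP head described above: 1536-dim embeddings pass
+ # through hidden layers [1536, 768, 384] and end in a single logit.
+ def build_mlp_head(input_dim=1536, hidden_dims=(1536, 768, 384), dropout=0.1):
+     layers, prev = [], input_dim
+     for h in hidden_dims:
+         layers += [nn.Linear(prev, h), nn.GELU(), nn.Dropout(dropout)]
+         prev = h
+     layers.append(nn.Linear(prev, 1))  # scalar correctness score
+     return nn.Sequential(*layers)
+ ```
+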
+ ## Usage
+
+ ```python
+ import torch
+
+ from custom_crossencoder import CustomCrossEncoder, TrainingConfig
+
+ # Initialize model with the same architecture used in training
+ config = TrainingConfig(
+     model_name="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+     max_length=4096,
+     mlp_hidden_dims=[1536, 768, 384]
+ )
+ model = CustomCrossEncoder(config)
+
+ # Load checkpoint
+ model.load_state_dict(torch.load("path_to_checkpoint.pt"))
+ model.eval()
+
+ # Get prediction for an instruction-answer pair
+ instruction = "Your instruction here"
+ answer = "Your answer here"
+ encoded = model.tokenizer(
+     text=instruction,
+     text_pair=answer,
+     truncation=True,
+     max_length=4096,
+     padding="max_length",
+     return_tensors="pt"
+ )
+ with torch.no_grad():
+     prediction = model(encoded["input_ids"], encoded["attention_mask"])
  ```
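
+ The model returns a raw score for each instruction-answer pair. Given the binary correct/incorrect training objective, reading the score through a sigmoid and ranking candidate answers by score are natural uses; both snippets below are illustrative sketches rather than part of the original usage example:
+
+ ```python
+ # Sigmoid is an assumption based on the stated binary training objective.
+ prob = torch.sigmoid(prediction).item()
+ print(f"P(correct) ≈ {prob:.4f}")
+
+ # Verifier-style usage: score several candidate answers, keep the best.
+ candidates = ["Answer A", "Answer B", "Answer C"]
+ scores = []
+ for cand in candidates:
+     enc = model.tokenizer(text=instruction, text_pair=cand, truncation=True,
+                           max_length=4096, padding="max_length", return_tensors="pt")
+     with torch.no_grad():
+         scores.append(model(enc["input_ids"], enc["attention_mask"]).item())
+ best = candidates[scores.index(max(scores))]
+ ```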

+ ## License

+ [Your chosen license]

+ ## Citation

+ If you use this model in your research, please cite:

+ TODO