Hanzo Dev committed on
Commit
435c70a
·
1 Parent(s): 333f111

Add 7 more top datasets: Magicoder, CodeFeedback, Evol-Instruct-Code, AgentInstruct, ToolBench, WebArena, OpenOrca, etc.

Browse files
Files changed (2) hide show
  1. README.md +11 -0
  2. app.py +39 -0
README.md CHANGED
@@ -44,8 +44,19 @@ Train any Zen model with any dataset combination from HuggingFace. Everything ru
44
  **Function Calling:**
45
  - xLAM 60k (Salesforce high-quality function calling)
46
 
 
 
 
 
 
 
 
 
 
 
47
  **Instruction Tuning:**
48
  - Alpaca (52k instruction samples)
 
49
 
50
  ## 🚀 How to Use
51
 
 
44
  **Function Calling:**
45
  - xLAM 60k (Salesforce high-quality function calling)
46
 
47
+ **Coding:**
48
+ - Magicoder-OSS-Instruct (75k code samples)
49
+ - CodeFeedback-Filtered (157k code instructions)
50
+ - Evol-Instruct-Code (80k evolved code complexity)
51
+
52
+ **Advanced Agentic:**
53
+ - AgentInstruct (1M agent trajectories from Microsoft)
54
+ - ToolBench (16k tool use examples)
55
+ - WebArena (2k web navigation tasks)
56
+
57
  **Instruction Tuning:**
58
  - Alpaca (52k instruction samples)
59
+ - OpenOrca (4.2M reasoning-focused instructions)
60
 
61
  ## 🚀 How to Use
62
 
app.py CHANGED
@@ -114,12 +114,51 @@ DATASETS = {
114
  "size": "60k samples"
115
  },
116
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  "Instruction Tuning": {
118
  "Alpaca": {
119
  "hf_id": "tatsu-lab/alpaca",
120
  "config": None,
121
  "size": "52k samples"
122
  },
 
 
 
 
 
123
  }
124
  }
125
 
 
114
  "size": "60k samples"
115
  },
116
  },
117
+ "Coding Datasets": {
118
+ "Magicoder-OSS-Instruct": {
119
+ "hf_id": "ise-uiuc/Magicoder-OSS-Instruct-75K",
120
+ "config": None,
121
+ "size": "75k code samples"
122
+ },
123
+ "CodeFeedback-Filtered": {
124
+ "hf_id": "m-a-p/CodeFeedback-Filtered-Instruction",
125
+ "config": None,
126
+ "size": "157k code samples"
127
+ },
128
+ "Evol-Instruct-Code": {
129
+ "hf_id": "nickrosh/Evol-Instruct-Code-80k-v1",
130
+ "config": None,
131
+ "size": "80k evolved code"
132
+ },
133
+ },
134
+ "Advanced Agentic": {
135
+ "AgentInstruct": {
136
+ "hf_id": "microsoft/orca-agentinstruct-1M-v1",
137
+ "config": None,
138
+ "size": "1M agent samples"
139
+ },
140
+ "ToolBench": {
141
+ "hf_id": "ToolBench/ToolBench",
142
+ "config": None,
143
+ "size": "16k tool use"
144
+ },
145
+ "WebArena": {
146
+ "hf_id": "neulab/agent-data-collection",
147
+ "config": "nnetnav-wa",
148
+ "size": "~2k web agent"
149
+ },
150
+ },
151
  "Instruction Tuning": {
152
  "Alpaca": {
153
  "hf_id": "tatsu-lab/alpaca",
154
  "config": None,
155
  "size": "52k samples"
156
  },
157
+ "OpenOrca": {
158
+ "hf_id": "Open-Orca/OpenOrca",
159
+ "config": None,
160
+ "size": "4.2M reasoning"
161
+ },
162
  }
163
  }
164