Spaces:
Paused
Paused
Hanzo Dev
committed on
Commit
·
435c70a
1
Parent(s):
333f111
Add 8 more top datasets: Magicoder, AgentInstruct, ToolBench, OpenOrca, etc
Browse files
README.md
CHANGED
|
@@ -44,8 +44,19 @@ Train any Zen model with any dataset combination from HuggingFace. Everything ru
|
|
| 44 |
**Function Calling:**
|
| 45 |
- xLAM 60k (Salesforce high-quality function calling)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
**Instruction Tuning:**
|
| 48 |
- Alpaca (52k instruction samples)
|
|
|
|
| 49 |
|
| 50 |
## 🚀 How to Use
|
| 51 |
|
|
|
|
| 44 |
**Function Calling:**
|
| 45 |
- xLAM 60k (Salesforce high-quality function calling)
|
| 46 |
|
| 47 |
+
**Coding:**
|
| 48 |
+
- Magicoder-OSS-Instruct (75k code samples)
|
| 49 |
+
- CodeFeedback-Filtered (157k code instructions)
|
| 50 |
+
- Evol-Instruct-Code (80k evolved code complexity)
|
| 51 |
+
|
| 52 |
+
**Advanced Agentic:**
|
| 53 |
+
- AgentInstruct (1M agent trajectories from Microsoft)
|
| 54 |
+
- ToolBench (16k tool use examples)
|
| 55 |
+
- WebArena (2k web navigation tasks)
|
| 56 |
+
|
| 57 |
**Instruction Tuning:**
|
| 58 |
- Alpaca (52k instruction samples)
|
| 59 |
+
- OpenOrca (4.2M reasoning-focused instructions)
|
| 60 |
|
| 61 |
## 🚀 How to Use
|
| 62 |
|
app.py
CHANGED
|
@@ -114,12 +114,51 @@ DATASETS = {
|
|
| 114 |
"size": "60k samples"
|
| 115 |
},
|
| 116 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
"Instruction Tuning": {
|
| 118 |
"Alpaca": {
|
| 119 |
"hf_id": "tatsu-lab/alpaca",
|
| 120 |
"config": None,
|
| 121 |
"size": "52k samples"
|
| 122 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
}
|
| 124 |
}
|
| 125 |
|
|
|
|
| 114 |
"size": "60k samples"
|
| 115 |
},
|
| 116 |
},
|
| 117 |
+
"Coding Datasets": {
|
| 118 |
+
"Magicoder-OSS-Instruct": {
|
| 119 |
+
"hf_id": "ise-uiuc/Magicoder-OSS-Instruct-75K",
|
| 120 |
+
"config": None,
|
| 121 |
+
"size": "75k code samples"
|
| 122 |
+
},
|
| 123 |
+
"CodeFeedback-Filtered": {
|
| 124 |
+
"hf_id": "m-a-p/CodeFeedback-Filtered-Instruction",
|
| 125 |
+
"config": None,
|
| 126 |
+
"size": "157k code samples"
|
| 127 |
+
},
|
| 128 |
+
"Evol-Instruct-Code": {
|
| 129 |
+
"hf_id": "nickrosh/Evol-Instruct-Code-80k-v1",
|
| 130 |
+
"config": None,
|
| 131 |
+
"size": "80k evolved code"
|
| 132 |
+
},
|
| 133 |
+
},
|
| 134 |
+
"Advanced Agentic": {
|
| 135 |
+
"AgentInstruct": {
|
| 136 |
+
"hf_id": "microsoft/orca-agentinstruct-1M-v1",
|
| 137 |
+
"config": None,
|
| 138 |
+
"size": "1M agent samples"
|
| 139 |
+
},
|
| 140 |
+
"ToolBench": {
|
| 141 |
+
"hf_id": "ToolBench/ToolBench",
|
| 142 |
+
"config": None,
|
| 143 |
+
"size": "16k tool use"
|
| 144 |
+
},
|
| 145 |
+
"WebArena": {
|
| 146 |
+
"hf_id": "neulab/agent-data-collection",
|
| 147 |
+
"config": "nnetnav-wa",
|
| 148 |
+
"size": "~2k web agent"
|
| 149 |
+
},
|
| 150 |
+
},
|
| 151 |
"Instruction Tuning": {
|
| 152 |
"Alpaca": {
|
| 153 |
"hf_id": "tatsu-lab/alpaca",
|
| 154 |
"config": None,
|
| 155 |
"size": "52k samples"
|
| 156 |
},
|
| 157 |
+
"OpenOrca": {
|
| 158 |
+
"hf_id": "Open-Orca/OpenOrca",
|
| 159 |
+
"config": None,
|
| 160 |
+
"size": "4.2M reasoning"
|
| 161 |
+
},
|
| 162 |
}
|
| 163 |
}
|
| 164 |
|