mratsim committed
Commit 3c9a12b · verified · 1 parent: 34f3f23

Create calibrate_software_engineer.yaml

Files changed (1): calibrate_software_engineer.yaml (+416 -0)
calibrate_software_engineer.yaml ADDED
@@ -0,0 +1,416 @@
+ calibration_set:
+   _templates:
+     programming_languages: &programming_languages "Solve the following problem using {{ ['Zephyr', 'Prolog', 'Cobol', 'Apex', 'Crystal', 'Fortran', 'Nim', 'Delphi', 'Ada', 'Objective-C', 'VBA', 'Perl', 'Groovy', 'MATLAB', 'Solidity', 'Visual Basic', 'OCaml', 'Erlang', 'Julia', 'Lisp', 'F#', 'Clojure', 'GDScript', 'Scala', 'R', 'Haskell', 'Ruby', 'Elixir', 'Lua', 'Zig', 'Dart', 'Swift', 'Metal', 'PowerShell', 'PHP', 'Kotlin', 'C', 'Java', 'C++', 'C#', 'Bash/Shell', 'Go', 'Rust', 'TypeScript', 'HTML/CSS', 'SQL', 'JavaScript', 'Python', 'Lean', 'Coq', 'Pony', 'D', 'Racket', 'Haxe', 'x86-64 ASM', 'ARM-64 ASM', 'LLVM IR', 'GLSL', 'CUDA', 'Vulkan'][hash(row|string) % 60] }}\n***\n"
+     spoken_languages: &spoken_languages "Answer in {{ ['Arabic', 'Chinese', 'French', 'German', 'Hebrew', 'Hindi', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish', 'Turkish'][hash(row|string) % 12] }}\n***\n"
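+     # Worked example of the selection above (assuming Python-like hash
+     # semantics in the templating engine): the row is stringified and hashed,
+     # so the pick is deterministic per row but varies across rows. If
+     # hash(row|string) were 1234, then 1234 % 60 == 34 selects 'PowerShell',
+     # and the rendered prefix becomes:
+     #   "Solve the following problem using PowerShell\n***\n"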
+   max_seq_length: 8192
+   shuffle: true
+   seed: 42
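+   # Rough size check (illustrative arithmetic): 590 samples at 8192 max
+   # tokens caps the calibration set at ~4.8M tokens; the real total is lower
+   # since most samples are far shorter than max_seq_length.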
+   datasets:
+
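+     # Formatter cheat-sheet (assumed behavior, inferred from how each is used
+     # below; not verified against the calibration tooling):
+     #   chat_completion -> renders an OpenAI-style "messages" list through the
+     #                      model's chat template
+     #   prompt_answer   -> joins two columns into a single user/assistant turn
+     #   sharegpt        -> converts ShareGPT "conversations" (from/value pairs)
+     #                      into "messages" before templating
+     #   raw_text        -> takes a single column verbatim, optionally
+     #                      prepending formatter_params.prefix
+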
+     # Category Summary (Total: 590 samples)
+     # =====================================================
+     # General chat (24 samples - 4.07%)
+     # Instruction and Reasoning tuning (14 samples - 2.37%)
+     # Multilingual (36 samples - 6.10%)
+     # Tool use (100 samples - 16.95%)
+     # Code / Programming / Software Engineering / Devops (328 samples - 55.59%)
+     # Math (12 samples - 2.03%)
+     # Sciences (16 samples - 2.71%)
+     # Medical (8 samples - 1.36%)
+     # Finance (8 samples - 1.36%)
+     # Business (16 samples - 2.71%)
+     # Humanities and Philosophy (8 samples - 1.36%)
+     # Creative Writing, Adventure, Roleplay (13 samples - 2.20%)
+     # General Knowledge and Pop Culture (2 samples - 0.34%)
+     # Specialized skills (4 samples - 0.68%)
+     # Misc (1 sample - 0.17%)
+     # =====================================================
+
+     # Research
+     # =====================================================
+     # According to this presentation https://minjiazhang.github.io/courses/fall24-resource/slides/awq.pdf
+     # AWQ only needs 64 samples to identify the salient weights that must be preserved.
+     #
+     # However, that research predates the boom of MoE (Mixture-of-Experts) models,
+     # and it is safer to assume that 64 samples of a general dataset cannot
+     # properly identify the salient weights of every expert.
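+     # Back-of-the-envelope (illustrative numbers, not from the slides): with
+     # top-k routing over E experts, each token reaches only k/E of the experts.
+     # For, say, E = 128 and k = 8, 64 calibration samples give each expert
+     # roughly 64 * 8/128 = 4 samples' worth of tokens on average, hence the
+     # larger, domain-weighted set used here.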
+
+     # General chat (24 samples)
+     # ---------------------------------------------------------------------------
+     - dataset: HuggingFaceH4/ultrachat_200k
+       columns: [messages]
+       split: train_sft
+       formatter: chat_completion
+       num_samples: 8
+       streaming: true
+
+     - dataset: databricks/databricks-dolly-15k
+       split: train
+       columns: [instruction, response]
+       formatter: prompt_answer
+       num_samples: 8
+
+     - dataset: neuralmagic/calibration
+       subset: LLM
+       split: train
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 8
+
+     # Instruction and Reasoning tuning (14 samples)
+     # ---------------------------------------------------------------------------
+     - dataset: HuggingFaceH4/no_robots
+       split: train
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 2
+
+     - dataset: nvidia/HelpSteer
+       split: train
+       columns: [prompt, response]
+       formatter: prompt_answer
+       num_samples: 2
+       streaming: true
+
+     - dataset: garage-bAInd/Open-Platypus
+       split: train
+       columns: [instruction, output]
+       formatter: prompt_answer
+       num_samples: 2
+
+     - dataset: PJMixers/grimulkan_physical-reasoning-ShareGPT
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 4
+
+     - dataset: PJMixers/grimulkan_theory-of-mind-ShareGPT
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 4
+
+     # Multilingual (36 samples)
+     # ---------------------------------------------------------------------------
+     - dataset: HuggingFaceH4/Multilingual-Thinking
+       split: train
+       columns: [user]
+       formatter: raw_text
+       num_samples: 32
+       formatter_params:
+         prefix: *spoken_languages
+
+     - dataset: ServiceNow-AI/M2Lingual
+       subset: full_data
+       split: train
+       columns: [conversation]
+       formatter: chat_completion
+       num_samples: 4
+       streaming: true
+
+     # Tool use (ToolACE below is commented out) (100 samples)
+     # ---------------------------------------------------------------------------
+
+     # Fails with MiniMax!
+     # jinja2.exceptions.TemplateError: Message has tool role, but there was no previous assistant message with a tool call!
+     # - dataset: Team-ACE/ToolACE
+     #   split: train
+     #   columns: [system, conversations]
+     #   formatter: chat_completion_with_sysprompt
+     #   num_samples: 100
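+     # Failure shape (an assumption read off the error above, not a verified
+     # repro): the chat template requires a tool-role message to directly
+     # follow an assistant message carrying a tool call, i.e.
+     #   user -> assistant (tool_calls) -> tool -> assistant
+     # and some ToolACE conversations apparently break that ordering.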
+
+     - dataset: interstellarninja/hermes_reasoning_tool_use
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 100
+       streaming: true
+
+     # Code / Programming / Software Engineering / Devops (328 samples)
+     # ---------------------------------------------------------------------------
+
+     - dataset: deepmind/code_contests
+       split: train
+       columns: [name]
+       formatter: deepmind_code_contests
+       num_samples: 50
+       streaming: true
+
+     - dataset: dh02391735/stackoverflow-kubernetes-questions
+       split: train
+       columns: [instruction]
+       formatter: raw_text
+       num_samples: 8
+       streaming: true
+
+     - dataset: diversoailab/humaneval-rust
+       split: train
+       columns: [prompt]
+       formatter: raw_text
+       num_samples: 100
+       formatter_params: # The dataset doesn't actually hardcode the language
+         prefix: *programming_languages
+
+     - dataset: ammarnasr/the-stack-rust-clean
+       split: train
+       columns: [content]
+       formatter: raw_text
+       num_samples: 8
+       streaming: true
+       formatter_params:
+         prefix: "Explain this code and comment it for a junior dev.\n***\n"
+
+     - dataset: CSJianYang/CodeArena
+       split: test
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 8
+
+     - dataset: nvidia/OpenCodeInstruct
+       split: train
+       columns: [input, output]
+       formatter: prompt_answer
+       num_samples: 8
+       streaming: true
+
+     - dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
+       split: code
+       columns: [input]
+       formatter: chat_completion
+       num_samples: 8
+       streaming: true
+
+     - dataset: nvidia/Nemotron-Competitive-Programming-v1
+       split: competitive_coding_cpp_part00
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 8
+       streaming: true
+
+     # The conversations column has a nested "conversations" field :/
+     # - dataset: sr5434/CodegebraGPT_data
+     #   subset: 100k-text
+     #   split: train
+     #   columns: [conversations]
+     #   formatter: sharegpt
+     #   num_samples: 8
+
+     - dataset: rombodawg/code_bagel_hermes-2.5
+       split: train
+       columns: [input, output]
+       formatter: prompt_answer
+       num_samples: 100
+       streaming: true
+
+     - dataset: MathArena/project_euler
+       split: train
+       columns: [problem]
+       formatter: raw_text
+       num_samples: 30
+       formatter_params:
+         prefix: *programming_languages
+
+     # Math (12 samples)
+     - dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
+       split: math
+       columns: [input]
+       formatter: chat_completion
+       num_samples: 4
+       streaming: true
+
+     - dataset: nvidia/Nemotron-Math-Proofs-v1
+       split: lean
+       columns: [formal_statement]
+       formatter: raw_text
+       num_samples: 4
+       streaming: true
+       formatter_params:
+         prefix: "Can you improve, document, and add comments to this Lean proof for a non-mathematician?\n***\n"
+
+     - dataset: nvidia/OpenMathInstruct-2
+       split: train
+       columns: [problem, generated_solution]
+       formatter: prompt_answer
+       num_samples: 4
+       streaming: true
+
+     # Sciences (16 samples)
+     - dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
+       split: science
+       columns: [input]
+       formatter: chat_completion
+       num_samples: 4
+       streaming: true
+
+     - dataset: nvidia/OpenScienceReasoning-2
+       split: train
+       columns: [input, output]
+       formatter: prompt_answer
+       num_samples: 8
+       streaming: true
+
+     - dataset: MegaScience/MegaScience
+       split: train
+       columns: [question, answer]
+       formatter: prompt_answer
+       num_samples: 4
+       streaming: true
+
+     # Medical (8 samples)
+     - dataset: OpenMed/Medical-Reasoning-SFT-GPT-OSS-120B
+       split: train
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 4
+       streaming: true
+
+     - dataset: ccdv/pubmed-summarization
+       subset: section
+       split: train
+       columns: [article]
+       formatter: raw_text
+       num_samples: 4
+       streaming: true
+       formatter_params:
+         prefix: "Summarize this:\n***\n"
+
+     # Finance (8 samples)
+     - dataset: gbharti/finance-alpaca
+       split: train
+       columns: [instruction, output]
+       formatter: prompt_answer
+       num_samples: 4
+
+     - dataset: vladlen32230/summarization-yahoo-stock-finance-article-text
+       split: train
+       columns: [text]
+       formatter: raw_text
+       num_samples: 4
+       formatter_params:
+         prefix: "Summarize this:\n***\n"
+
+     # Business (16 samples)
+     - dataset: fka/awesome-chatgpt-prompts
+       split: train
+       columns: [prompt]
+       formatter: raw_text
+       num_samples: 8
+
+     - dataset: theoldmandthesea/17k_business_book
+       split: train
+       columns: [question, answer]
+       formatter: prompt_answer
+       num_samples: 8
+
+     # Humanities and Philosophy (8 samples)
+     - dataset: ruggsea/stanford-encyclopedia-of-philosophy_instruct
+       split: train
+       columns: [question, answer]
+       formatter: prompt_answer
+       num_samples: 2
+       streaming: true
+
+     - dataset: mlfoundations-dev/stackexchange_philosophy
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 2
+
+     - dataset: FreedomIntelligence/SocraticChat
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 4
+       streaming: true
+
+     # Creative Writing, Adventure, Roleplay (13 samples)
+     - dataset: Gryphe/Opus-WritingPrompts
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 2
+
+     - dataset: anthracite-org/nopm_claude_writing_fixed
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 2
+
+     - dataset: zerofata/Roleplay-Anime-Characters
+       split: train
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 1
+
+     - dataset: zerofata/Instruct-Anime
+       split: train
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 1
+
+     - dataset: zerofata/Instruct-Anime-CreativeWriting
+       split: train
+       columns: [messages]
+       formatter: chat_completion
+       num_samples: 1
+
+     - dataset: sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo
+       split: train
+       columns: [chosen]
+       formatter: chat_completion
+       num_samples: 2
+
+     - dataset: PocketDoc/Dans-Prosemaxx-Adventure
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 2
+
+     - dataset: anthracite-org/stheno-filtered-v1.1
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 2
+       streaming: true
+
+     # General Knowledge and Pop Culture (2 samples)
+     - dataset: KaraKaraWitch/TvTroper-2025
+       split: train
+       columns: [article]
+       formatter: raw_text
+       num_samples: 2
+       streaming: true
+       formatter_params:
+         prefix: "Explain this trope like I'm your grandmother\n***\n"
+
+     # Specialized skills (4 samples)
+     - dataset: AquaV/US-Army-Survival-Sharegpt
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 1
+
+     - dataset: AquaV/Interrogation-Sharegpt
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 1
+
+     - dataset: AquaV/Multi-Environment-Operations-Sharegpt
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 1
+
+     - dataset: AquaV/Resistance-Sharegpt
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 1
+
+     # Misc (1 sample)
+     - dataset: PocketDoc/Dans-Kinomaxx-VanillaBackrooms
+       split: train
+       columns: [conversations]
+       formatter: sharegpt
+       num_samples: 1