-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinference.py
executable file
·721 lines (620 loc) · 43.6 KB
/
inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
import collections
import openai
import os
import time
import threading
import json
import _thread
import random
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import contextmanager
from collections import defaultdict
from openai import OpenAI
from metric_utils import extract_answer, extract_answers_for_model, extract_answers_for_gt
import copy
bbh_temp = [
{
'question': "Question: Find a movie similar to Interstellar, Monsters, Inc, Back to the Future, Inception:\nOptions:\n(A) My Fellow Americans\n(B) Shadow of the Vampire\n(C) Star Wars Episode VII - The Force Awakens\n(D) Psycho II\nAnswer: (C).\n\nSummary of learning as a numbered list:",
'rational': '''
1: Identify Key Characteristics: Look for central themes, genres, and elements in the movies.
2: Find Common Themes: Determine the shared themes and genres among the given movies.
3: Evaluate Options: Compare each option against these common themes and genres.
4: Select the Best Fit: Choose the movie that best matches the identified key elements.''',
'answer': '(C)'
},
{
'question': "Question: Select the humorous edit that 'ruins' the input movie or musical artist name. Which of the following is a humorous edit of this artist or movie name: 'radiohead'?\nOptions:\n(A) radio head\n(B) rawdiohead\n(C) radiodead\n(D) radiohesd\nAnswer: (C).\n\nSummary of learning as a numbered list:",
'rational': '''
1: Understand the Task: Ensure you know the goal is to find a humorous alteration.
2: Identify the Original Name: Clearly recognize the name you are altering.
3: Analyze Each Option: Consider how each option changes the original name and assess its humor.
4: Select the Most Humorous Edit: Choose the option that changes the meaning in a clever and funny way.''',
'answer': '(C)'
},
{
'question': "Question: Given a series of navigation instructions, determine whether one would end up back at the starting point. If you follow these instructions, do you return to the starting point? Take 3 steps. Take 6 steps. Take 8 steps. Take 5 steps. Take 6 steps. Take 2 steps. Take 1 step. Take 2 steps.\nOptions:\n- Yes\n- No\nAnswer: No.\n\nSummary of learning as a numbered list:",
'rational': '''
1.Understand the Task: Recognize that the goal is to determine if the instructions lead back to the starting point.
2.Analyze Each Step: Carefully read through each navigation instruction.
3.Track Movement: Keep a running tally or map of the steps taken, noting direction and distance.
4.Calculate Net Movement: Determine the overall movement by summing up the steps in each direction.
5.Evaluate Result: Compare the net movement to the starting point to see if they align.
6.Determine Outcome: Decide whether the total movements result in returning to the starting point or not.''',
'answer': 'No'
},
]
bbh_temp_full = [
{
'question': "Question: Find a movie similar to Interstellar, Monsters, Inc, Back to the Future, Inception:\nOptions:\n(A) My Fellow Americans\n(B) Shadow of the Vampire\n(C) Star Wars Episode VII - The Force Awakens\n(D) Psycho II\nAnswer: (C).\n\n",
'rational': '''
Summary of Learning:
1. Understand the Question:
- Recognize that the task is to find a movie similar to the given set of movies based on common themes or genres.
2. Identify Key Characteristics:
- Look for central themes, genres, and elements in the given movies: "Interstellar," "Monsters, Inc," "Back to the Future," and "Inception."
3. Find Common Themes:
- Determine the shared themes and genres among the given movies (e.g., science fiction, fantasy, adventure, time manipulation).
4. Evaluate Options:
- Compare each option against these common themes and genres to see which one aligns best.
5. Select the Best Fit:
- Choose the movie that best matches the identified key elements and common themes.
Supplementary Knowledge:
1. Background Information on Given Movies:
- Interstellar: A sci-fi film focused on space exploration, time manipulation, and the survival of humanity.
- Monsters, Inc.: An animated fantasy movie about monsters who generate energy by scaring children, featuring alternate dimensions.
- Back to the Future: A sci-fi film centered on time travel and the adventures that arise from altering the past and future.
- Inception: A sci-fi movie that explores the concept of entering and manipulating dreams to influence the real world.
2. Genre Definitions:
- Science Fiction (Sci-fi): Fiction based on imagined future scientific or technological advances and major social or environmental changes.
- Fantasy: A genre that uses magic and other supernatural elements as a primary plot element, theme, or setting.
- Adventure: A genre characterized by exciting and unusual experiences or journeys.
3. Analyzing Movie Elements:
- Look at the key elements of the plot, setting, and characters in movies to determine their genre and themes.
- Consider how movies make viewers feel (e.g., excited, thoughtful) and how this relates to the themes and genres.
4. Critical Thinking in Choices:
- Elimination Strategy: Discard options that clearly do not fit the identified common themes and genres.
- Comparison: Compare the remaining options based on a detailed understanding of the themes and genres discussed.
5. Example Analysis:
- My Fellow Americans: A political comedy, not fitting the sci-fi/fantasy/adventure themes.
- Shadow of the Vampire: A horror film, not fitting the sci-fi/fantasy/adventure themes.
- Star Wars Episode VII - The Force Awakens: A sci-fi/fantasy adventure film, fitting the themes of space exploration and adventure.
- Psycho II: A horror/thriller, not fitting the sci-fi/fantasy/adventure themes.
- Conclusion: Option (C) "Star Wars Episode VII - The Force Awakens" aligns best with the common themes and genres of the given movies.''',
'answer': '(C)'
},
{
'question': "Question: Select the humorous edit that 'ruins' the input movie or musical artist name. Which of the following is a humorous edit of this artist or movie name: 'radiohead'?\nOptions:\n(A) radio head\n(B) rawdiohead\n(C) radiodead\n(D) radiohesd\nAnswer: (C).\n\n",
'rational': '''
Summary of Learning:
1. Understand the Question:
- Recognize that the task is to select a humorous edit that alters the original name while retaining a recognizable connection to it.
2. Identify the Original Name:
- Clearly identify the original name provided (in this case, "radiohead").
3. Evaluate Each Option:
- Assess each option to see how it humorously changes the original name.
4. Select the Funniest Edit:
- Choose the option that is most humorous while still recognizable as a play on the original name.
Supplementary Knowledge:
1. Humor in Wordplay:
- Puns: A form of wordplay that exploits multiple meanings of a term or similar-sounding words.
- Phonetic Alterations: Changing the sounds of the original word to create humor.
2. Criteria for a Good Humorous Edit:
- Recognizability: The altered name should still be recognizable as a variant of the original name.
- Humor: The alteration should add a humorous twist, often by implying something funny or absurd.
3. Types of Humorous Edits:
- Literal Translations: Making a literal translation or interpretation of part of the name (e.g., "radio head").
- Phonetic Play: Changing the sound slightly to create a humorous effect (e.g., "radiohesd").
- Conceptual Twist: Introducing a concept that twists the meaning humorously (e.g., "radiodead" suggests the opposite of "radiohead").
4. Critical Thinking Strategies:
- Recognize Patterns: Notice common humorous patterns in altered names.
- Assess Humor Impact: Evaluate which option is the funniest based on common humor principles.
5. Example Analysis:
- Given Options:
- (A) radio head: A literal and slightly humorous change.
- (B) rawdiohead: A phonetic change, less recognizable.
- (C) radiodead: A humorous twist, suggesting the opposite of "radiohead."
- (D) radiohesd: A minor phonetic change, less impactful.
- Select (C) "radiodead" as it introduces a humorous and easily recognizable twist.''',
'answer': '(C)'
},
{
'question': "Question: Given a series of navigation instructions, determine whether one would end up back at the starting point. If you follow these instructions, do you return to the starting point? Take 3 steps. Take 6 steps. Take 8 steps. Take 5 steps. Take 6 steps. Take 2 steps. Take 1 step. Take 2 steps.\nOptions:\n- Yes\n- No\nAnswer: No.\n\n",
'rational': '''
Summary of Learning:
1. Understand the Question:
- Clearly understand what the question is asking. In this case, determine if the navigation instructions lead back to the starting point.
2. Analyze Each Instruction:
- Break down each step given in the instructions to understand the movement.
3. Track Position:
- Keep track of your position as you follow each instruction, either mentally or by writing it down.
4. Determine Final Position:
- Compare the final position with the starting point to see if they match.
Supplementary Knowledge:
1. Understanding Navigation Instructions:
- Steps in One Direction: Ensure all steps are considered in a straight line or in a specified direction.
- Returning to the Starting Point: Sum of all steps in opposite directions should equal zero for a return to the starting point.
2. Types of Navigation Movements:
- Linear Movements: Movements that happen in a straight line without turning.
- Directional Movements: Movements that involve turns or changes in direction (e.g., left, right).
3. Basic Math Concepts:
- Addition and Subtraction: Understanding how to add and subtract steps to find the final position.
- Net Movement: Concept of net movement to determine overall change in position.
4. Critical Thinking Strategies:
- Step-by-Step Analysis: Carefully follow each step one by one without skipping any.
- Summarize Movements: After tracking all steps, summarize the total movement to easily determine the end position.
- Decision Making: Based on the analysis, conclude whether the answer is "Yes" or "No."
5. Example Analysis:
- Given Instructions: Take 3 steps. Take 6 steps. Take 8 steps. Take 5 steps. Take 6 steps. Take 2 steps. Take 1 step. Take 2 steps.
- Net Calculation:
- Sum of steps: \(3 + 6 + 8 + 5 + 6 + 2 + 1 + 2 = 33\)
- Since there are no specific directions given, assume all steps are in the same direction.
- Net movement is 33 steps in one direction, indicating that the final position is not the starting point.''',
'answer': 'No'
},
]
bbh_error = [
{
'question': "The student was given the following question:\nThe following are multi-step reasoning math questions about gsm8k. Utilize mathematical and logical abilities to solve multi-step basic mathematical reasoning problems. Your response should conclude with the format \"Therefore, the answer is\". Question: DeShaun always wins the award in English class for reading the most books over the summer. This year is no different. Summer break is 80 days long. When he arrives, he notifies the teacher that he read 60 books over the summer. Each book averaged 320 pages long. The person closest to him read 75% as much as he did. How many pages did the second person read on average each day of break?\n",
'answer': "The student’s wrong response is:\n60*320 = <<60*320=19200>>19200 pages\n75% of 19200 = <<75*19200=14000>>14000 pages\nHe read 14000/80 = <<14000/80=175>>175 pages a day.\nTherefore, the answer is 175.\n",
'rational':'''
Learning Summary:
The student made an error in calculating 75% of the total number of pages DeShaun read. The incorrect calculation led to an erroneous conclusion about the average number of pages read per day by the second person. To correct this, the student needs to follow these steps:
1. Calculate the total number of pages DeShaun read: Multiply the number of books by the average number of pages per book.
2. Calculate 75% of the total pages: Apply the percentage correctly.
3. Calculate the average number of pages per day for the second person: Divide the total pages read by the number of days.
Supplementary Knowledge:
1. Percentage Calculation:
- Understanding that 75% means 75 out of 100 or 0.75 as a decimal.
- To find 75% of a number, multiply the number by 0.75.
2. Steps to Correct the Error:
- Step 1: Calculate the total pages DeShaun read: \( 60 \text{ books} \times 320 \text{ pages/book} = 19200 \text{ pages} \).
- Step 2: Calculate 75% of 19200 pages: \( 19200 \times 0.75 = 14400 \text{ pages} \).
- Step 3: Calculate the average number of pages read per day: \( 14400 \text{ pages} / 80 \text{ days} = 180 \text{ pages/day} \).
3. Logical Reasoning:
- Verify each step to ensure the calculations are accurate.
- Understanding how to break down a multi-step problem into manageable parts helps avoid errors and ensures the solution process is clear and logical.
Correct Solution:
1. Calculate the total number of pages DeShaun read:
\[
60 \text{ books} \times 320 \text{ pages/book} = 19200 \text{ pages}
\]
2. Calculate 75% of 19200 pages:
\[
19200 \times 0.75 = 14400 \text{ pages}
\]
3. Calculate the average number of pages read per day by the second person:
\[
14400 \text{ pages} / 80 \text{ days} = 180 \text{ pages/day}
\]
Therefore, the answer is 180.'''
},
{
'question': 'The student was given the following question:\nThe following are multiple choice questions (with answers) about logical_deduction_three_objects. A logical deduction task which requires deducing the order of a sequence of objects. Your response should conclude with the format \"Therefore, the answer is\".\n\nQuestion: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. A fruit stand sells three fruits: peaches, pears, and mangoes. The mangoes are less expensive than the peaches. The mangoes are more expensive than the pears.\nOptions:\n(A) The peaches are the second-most expensive\n(B) The pears are the second-most expensive\n(C) The mangoes are the second-most expensive\n',
'answer': 'The student’s wrong response is:\n1. The mangoes are less expensive than the peaches: \"mangoes < peaches\".\n2. The mangoes are more expensive than the pears: \"mangoes > pears\".\nFrom these two statements, we can conclude that \"pears < mangoes < peaches\".\nAccording to this ordering, the second-most expensive fruit is the pears.\nThe pears are the second-most expensive. Therefore, the answer is (B).\n',
'rational': '''
Learning Summary:
The student made an error in determining the second-most expensive fruit by misinterpreting the logical relationships given in the problem. The correct approach involves properly understanding the comparative prices of the fruits based on the provided statements. The key is to correctly order the fruits by price and identify the second-most expensive one.
Supplementary Knowledge:
1. Logical Deduction:
- When faced with comparative statements, carefully align each object in a sequence that satisfies all given conditions.
- Ensure that each step logically follows from the previous one and that all given relationships are accounted for.
2. Steps to Correct the Error:
- Step 1: Identify and write down the relationships from the statements:
- Mangoes are less expensive than peaches: \( \text{mangoes} < \text{peaches} \).
- Mangoes are more expensive than pears: \( \text{mangoes} > \text{pears} \).
- Step 2: Combine these relationships to form a complete order:
- From the given relationships: \( \text{pears} < \text{mangoes} < \text{peaches} \).
- Step 3: Determine the second-most expensive fruit from the ordered sequence:
- The sequence is \( \text{pears} < \text{mangoes} < \text{peaches} \).
- The second-most expensive fruit is the mangoes.
3. Critical Thinking:
- It is essential to double-check the logical consistency of the ordering.
- Visualizing or writing down the sequence can help avoid misinterpretation.
Correct Solution:
1. From the statements, we know:
- The mangoes are less expensive than the peaches: \( \text{mangoes} < \text{peaches} \).
- The mangoes are more expensive than the pears: \( \text{mangoes} > \text{pears} \).
2. Combining these two pieces of information:
\[
\text{pears} < \text{mangoes} < \text{peaches}
\]
3. From the ordered sequence, the second-most expensive fruit is:
\[
\text{mangoes}
\]
Therefore, the answer is (C).'''
}
]
bbh_err = [
{
'question': "The student was given the following question:\nThe following are multi-step reasoning math questions about gsm8k. Utilize mathematical and logical abilities to solve multi-step basic mathematical reasoning problems. Your response should conclude with the format \"Therefore, the answer is\". Question: DeShaun always wins the award in English class for reading the most books over the summer. This year is no different. Summer break is 80 days long. When he arrives, he notifies the teacher that he read 60 books over the summer. Each book averaged 320 pages long. The person closest to him read 75% as much as he did. How many pages did the second person read on average each day of break?\n",
'answer': "The student’s wrong response is:\n60*320 = <<60*320=19200>>19200 pages\n75% of 19200 = <<75*19200=14000>>14000 pages\nHe read 14000/80 = <<14000/80=175>>175 pages a day.\nTherefore, the answer is 175.\n",
'rational':'Learning Summary:\n1. Check Calculations Thoroughly: Always double-check your arithmetic operations to ensure accuracy. A single error in multiplication or division can lead to an incorrect answer.\n2. Understand Percentage Calculations: When dealing with percentages, ensure you understand how to correctly convert percentages to decimals and perform the necessary operations.\n3. Units Consistency: Keep track of units throughout your calculations. Ensure you are consistent in using and converting units where necessary.\n4. Logical Flow: Break down the problem into clear, logical steps. Ensure each step follows from the previous one and leads logically to the next.\n5. Final Answer Format: Conclude your answer with a clear statement in the format “Therefore, the answer is [answer]” to indicate completion.\n\nSupplementary Knowledge:\n1. Basic Arithmetic Operations:\n - Multiplication: Understand how to multiply numbers correctly, including large numbers.\n - Division: Be able to divide numbers accurately, ensuring the division makes sense within the context.\n2. Percentage Calculations:\n - Understanding Percentages: Know how to convert percentages to decimals (e.g., 75% = 0.75) and apply them in calculations.\n - Finding Percentages of a Number: Be able to calculate a percentage of a given number (e.g., 75% of 19200).\n3. Averages:\n - Calculating Averages: Understand how to find the average of a set of numbers, including dividing the total by the number of units (e.g., total pages read divided by the number of days).\n4. Units of Measurement:\n - Consistency in Units: Keep track of units (e.g., pages, days) throughout your calculations to ensure consistency and correctness.'
},
{
'question': 'The student was given the following question:\nThe following are multiple choice questions (with answers) about logical_deduction_three_objects. A logical deduction task which requires deducing the order of a sequence of objects. Your response should conclude with the format \"Therefore, the answer is\".\n\nQuestion: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. A fruit stand sells three fruits: peaches, pears, and mangoes. The mangoes are less expensive than the peaches. The mangoes are more expensive than the pears.\nOptions:\n(A) The peaches are the second-most expensive\n(B) The pears are the second-most expensive\n(C) The mangoes are the second-most expensive\n',
'answer': 'The student’s wrong response is:\n1. The mangoes are less expensive than the peaches: \"mangoes < peaches\".\n2. The mangoes are more expensive than the pears: \"mangoes > pears\".\nFrom these two statements, we can conclude that \"pears < mangoes < peaches\".\nAccording to this ordering, the second-most expensive fruit is the pears.\nThe pears are the second-most expensive. Therefore, the answer is (B).\n',
'rational': 'Learning Summary:\n1. Understand SVG Path Commands: Familiarize yourself with SVG path commands such as M (move to) and L (line to). Know what each command does and how they combine to form shapes.\n2. Identify Shape Properties: Recognize the properties of basic geometric shapes, such as the number of sides, equal-length sides, and internal angles.\n3. Process of Elimination: Use the process of elimination effectively by comparing the given options and ruling out those that do not fit the criteria derived from the problem.\n4. Double-Check Logic: Ensure the logical steps taken in identifying the shape are sound and based on the properties and definitions of the shapes.\n5. Conclude Clearly: Always conclude with a clear statement in the format “Therefore, the answer is [answer]” to indicate the final decision.\n\nSupplementary Knowledge:\n1. SVG Path Commands:\n - Move To (M): The M command moves the current point to the specified coordinates (x, y).\n - Line To (L): The L command draws a straight line from the current point to the specified coordinates (x, y).\n2. Basic Geometric Shapes:\n - Triangle: A shape with three sides and three angles.\n - Rectangle: A four-sided shape with opposite sides equal and all angles 90 degrees.\n - Kite: A four-sided shape with two pairs of adjacent sides that are equal.\n - Pentagon: A shape with five sides and five angles.\n - Hexagon: A shape with six sides and six angles.\n - Heptagon: A shape with seven sides and seven angles.\n - Octagon: A shape with eight sides and eight angles.\n - Circle: A round shape with all points equidistant from the center.\n - Line: A straight path that extends in both directions with no endpoints.\n - Sector: A portion of a circle, defined by two radii and the connecting arc.\n3. Properties of Shapes:\n - Number of Sides: Knowing how many sides a shape has can help identify it.\n - Side Lengths: Understanding if sides are equal or different in length is crucial for classification.\n - Angles: Knowing the internal angles can help differentiate between shapes, especially quadrilaterals.\n\nBy mastering these concepts and methods, students can improve their ability to solve similar multiple-choice questions about geometric shapes effectively.'
}
]
bbh_err_new = [
{
'question': "The student was given the following question:\nThe following are multiple choice questions (with answers) about elementary_mathematics. Your response should conclude with the format \"Therefore, the answer is\".\n\nQuestion: Carmen saved 592 pennies. Her sister saved 128 pennies. Together, they put 250 pennies in wrappers and took them to the bank. Whatis the total number of pennies, rounded to the nearest hundred, Carmen and her sister have left?\nA. 300 \nB. 500 \nC. 700 \nD. 1,000\n",
'answer': "The student’s wrong response is:\nFirst, let's calculate the total number of pennies Carmen and her sister have left after putting 250 pennies in wrappers and taking them to the bank. \nCarmen saved 592 pennies, her sister saved 128 pennies, and together, they put 250 pennies in wrappers, giving a total of 592 + 128 + 250 = 970 pennies.\n\nNext, we need to round this total number of pennies to the nearest hundred. Since 970 is less than 1,000, we round down to 300 as the total number of pennies Carmen and her sister have left.\n\nTherefore, the answer is A. 300.\n",
'rational':'Learning Summary:\n1. Careful Reading and Question Comprehension: Always read the question thoroughly to understand what is being asked, including any actions already taken (e.g., pennies put in wrappers).\n2. Correct Mathematical Operations: Ensure that addition or subtraction operations reflect the sequence of events described in the question.\n3. Rounding Rules: Round numbers according to the provided guidelines. For rounding to the nearest hundred, if the tens digit is 5 or more, round up; if less than 5, round down.\n4. Verification of Calculation: Double-check arithmetic operations to ensure accuracy, especially with multiple steps.\n\nSupplementary Knowledge:\n1. Basic Arithmetic Operations: Addition and subtraction are fundamental for solving such problems. Adding amounts to find a total and subtracting what has been used or taken away is crucial.\n2. Rounding Numbers: Understand how to round numbers to the nearest ten, hundred, thousand, etc. If the digit immediately following the rounding position is 5 or greater, round up; if less than 5, round down.\n3. Handling Currency: Recognize currency denominations such as pennies and understand their accumulation for managing money-related problems.\n4. Problem-Solving Strategy: Use a step-by-step approach to problem-solving, including understanding the question, performing calculations, checking the work, and presenting the answer clearly.'
},
{
'question': 'The student was given the following question:\nThe following are multi-step reasoning math questions about gsm8k. Utilize mathematical and logical abilities to solve multi-step basic mathematical reasoning problems. Your response should conclude with the format \"Therefore, the answer is\".\n\nQuestion: Mary and Jimmy start running from the same spot, but in opposite directions. Mary runs at 5 miles per hour and Jimmy runs at 4 miles per hour. What is the distance between them after 1 hour?\n',
'answer': 'The student’s wrong response is:\n1 hour is 60 minutes.\nMary moves 5 miles/hour*60 minutes=<<5*60=300>>300 miles in 1 hour.\nJimmy moves 4 miles/hour*60 minutes=<<4*60=240>>240 miles in 1 hour.\nThe distance between them is 300 miles-240 miles=<<300-240=60>>60 miles.\nTherefore, the answer is 60.\n',
'rational': "Learning Summary:\n1. Speed and Direction Consideration: When dealing with two objects moving in opposite directions, correctly incorporate their speeds and directions to determine their relative distance.\n2. Unit Conversion Accuracy: Be precise in converting units (e.g., hours to minutes) to match the given parameters and ensure consistency throughout the calculation.\n3. Determine the Direction: Based on the question, decide whether to subtract distances for the same direction or add them for opposite directions.\n4. Check Final Result: Always review your final calculation to ensure it aligns with the question's requirements and makes logical sense.\n5. Accuracy in Arithmetic Operations: Maintain accuracy in mathematical operations, especially when subtracting or comparing values.\n\nSupplementary Knowledge: \n1. Speed and Distance Relationships: Know the formula for calculating distance (distance = speed × time). For objects starting from the same point and moving in the same direction, the difference in their distances is the difference in their speeds multiplied by the time. For objects moving in opposite directions, the total distance is the sum of their speeds multiplied by the time.\n2. Time Conversions: 1 hour equals 60 minutes, and 1 minute equals 60 seconds. Convert time units as needed based on the problem.\n3. Problem Solving Strategy: Use a systematic approach to problem-solving by breaking down tasks into manageable steps for efficient and accurate solutions."
}
]
class TimeoutException(Exception):
def __init__(self, msg=''):
self.msg = msg
@contextmanager
def time_limit(seconds, msg=''):
timer = threading.Timer(seconds, lambda: _thread.interrupt_main())
timer.start()
try:
yield
except KeyboardInterrupt:
raise TimeoutException("Timed out for operation {}".format(msg))
finally:
# if the action ends in specified time, timer is canceled
timer.cancel()
def add_prompt(item, prompt):
def rmreturn(s):
s = s.replace('\n\n', ' ')
s = s.replace('\n', ' ')
return s.strip()
choice = "\n{choice}. {content}"
query = item['question']
candidates = ' '.join([choice.format(choice=chr(ord("A") + id), content=ch) for id, ch in enumerate(item["choices"])])
prompt = prompt.replace('{query}', query + candidates)
if item.get('output'): # background info
backinfo = rmreturn(item['output'][0])
prompt = prompt.replace('{background}', backinfo)
return prompt
def clustering_prompt(items, prompt):
def rmreturn(s):
s = s.replace('\n\n', ' ')
s = s.replace('\n', ' ')
return s.strip()
cluster_prompts = []
for item in items:
query = item['question']
backinfo = rmreturn(item['output'][0])
item_prompt = prompt.replace('{query}', query)
item_prompt += f' {backinfo}'
cluster_prompts.append(item_prompt)
cluster_prompts.append(prompt)
return ' \n\n\n\n '.join(cluster_prompts)
def run_embeddings(input_text, engine='text-similarity-davinci-001'):
texts = [t.replace('\n', '') for t in input_text]
outputs = openai.Embedding.create(input=texts, model=engine)['data']
embeddings = [o['embedding'] for o in outputs]
return embeddings
def run_inference(inputs_with_prompts, engine, max_tokens, num_sequence=1, temp=0):
completions = {"choices": []}
# for _ in range(200):
# try:
# with time_limit(20, 'run gpt-4'):
# completions = openai.chat.completions.create(
# model="gpt-4", # use gpt-4
# messages=[
# {"role": "system", "content": "You are a helpful assistant."},
# {"role": "user", "content": inputs_with_prompts}
# ]
# )
# break
# except:
# time.sleep(2)
for _ in range(5):
try:
completions = openai.chat.completions.create(
model="gpt-4", # use gpt4
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": inputs_with_prompts}
]
)
break
except Exception as e:
print(e)
print(inputs_with_prompts)
time.sleep(2)
if len(completions['choices']) == 0:
return None
outputs = completions.choices[0].message.content
return outputs
def predict(params):
prompt, query, engine, datatype, n, full = params
label = 1
if datatype == 'temp':
question = query['instruction']
# query['task_name'] = 'multi-step_math'
# query['task_description'] = query['task_discription']
# del query['task_discription']
fewshot = ''
if full == 1:
bbh = bbh_temp_full
else:
bbh = bbh_temp
for ex in bbh[:n]:
fewshot += ex['question'] + ex['rational'] + '\n\n'
input_with_prompt = prompt.format(prefix=query['prefix_prompt'], task=query['task_description'], query=question, fewshot=fewshot, answer=query['output'])
# query['input_with_prompt'] = input_with_prompt
elif datatype == 'temp_error':
question = query['question']
# query['task_name'] = 'multi-step_math'
# query['task_description'] = query['task_discription']
# del query['task_discription']
bbh = bbh_err_new
if n == 0:
fewshot = '\n\n'
else:
fewshot=''
for i, ex in enumerate(bbh[:n]):
fewshot += f'\n\nExample {i}' + ex['question'] + '\n' + ex['answer'] + '\n' + ex['rational']
if n != 0:
fewshot += '\n\n'
question = question.split('\nAnswer: ')[0]
response = query['stu_response']
if '? ? ? ? ? ? ? ? ? ? ?' in response:
response = response.split('? ? ? ? ? ? ? ? ? ? ?')[0]
if '| | | | | | | | | | | | |' in response:
response = response.split('| | | | | | | | | | | | |')[0]
if 'per day per day per day per day' in response:
response = response.split('per day per day per day per day')[0]
if '\t\t\t\t\t\t\t\t\t\t\t\t\t\t' in response:
response = response.split('\t\t\t\t\t\t\t\t\t\t\t\t\t\t')[0]
if '+ True + True + True + True' in response:
response = response.split('+ True + True + True + True')[0]
if '888888888' in response:
response = response.split('888888888')[0]
if '0000000000000000000' in response:
response = response.split('0000000000000000000')[0]
if '0, 0, 0, 0, 0, 0, 0, 0, 0, 0' in response:
response = response.split('0, 0, 0, 0, 0, 0, 0, 0, 0, 0')[0]
if '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd' in response:
response = response.split('\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd')[0]
if 'not not not not not not not' in response:
response = response.split('not not not not not not not')[0]
response = ' '.join(response.split(' ')[:256])
input_with_prompt = prompt.format(question=question, fewshot=fewshot, response=response)
else:
# input_with_prompt = add_prompt(query, prompt)
question = query['instruction']
# choice = "\n{choice}. {content}"
# candidates = ' '.join([choice.format(choice=chr(ord("A") + id), content=query[chr(ord("A") + id)]) for id in range(4)])
# input_with_prompt = prompt.format(subject=query['subject'], query=question)
input_with_prompt = prompt.format(query=question)
label = 1
# query['instruction'] = question + candidates
# query['task_name'] = query['subject']
# query['task_description'] = ''
# query['output'] = query['answer']
# query['input'] = ''
retry_count = 15
retry_interval = 5
if 'yi' in engine:
API_KEY = ""
API_BASE = "https://api.lingyiwanwu.com/v1"
elif 'gpt' in engine:
API_KEY = ""
# openai.base_url = url
API_BASE = ''
elif 'glm' in engine:
API_KEY = ""
# openai.base_url = url
API_BASE = 'https://open.bigmodel.cn/api/paas/v4/'
else:
API_KEY = ""
API_BASE = f'https://cloud.infini-ai.com/maas/{engine}/nvidia/'
client = OpenAI(
api_key=API_KEY,
base_url=API_BASE
)
if label == 0:
if 'Therefore, the answer is' not in query['response']:
label = 1
else:
response = query['response'].split('Therefore, the answer is')[1].split('.')[0].strip().rstrip('.')
if response[0] != query['output']:
label = 1
if label:
for _ in range(retry_count):
try:
response = client.chat.completions.create(
model=engine, # use gpt4
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": input_with_prompt}
],
temperature=0.8,
# stream=True
)
msg = response.choices[0].message.content.strip()
query['summary'] = msg
return question, query
except TimeoutError:
print("Timeout", question)
retry_count += 1
# retry_interval *= 2
time.sleep(retry_interval)
except Exception as e:
print(e)
# print(question)
retry_count += 1
# retry_interval *= 2
time.sleep(retry_interval)
query['summary'] = 'api defeat'
else:
query['summary'] = query['response']
return question, query
def run_main(inlines, outfile, engine, prompt, datatype, n=0, temp=0, full=0):
if os.path.exists(outfile):
outs = open(outfile, 'a', encoding='utf8')
num_lines = len(open(outfile, 'r').readlines())
inlines = inlines[num_lines:]
else: # not os.path.exists(outfile)
outs = open(outfile, 'a', encoding='utf8')
# outs.write(json.dumps({"prompt": prompt}) + '\n')
# inlines = inlines[:4]
# inlines = random.sample(inlines, min(len(inlines), 4000))
pbar = tqdm(total=len(inlines))
index = 0
pbar.update(index)
# predict((prompt, inlines[0], engine, datatype, n, full))
with ProcessPoolExecutor(max_workers=20) as executor:
futures = [executor.submit(predict, (prompt, query, engine, datatype, n, full)) for query in inlines]
query2res = collections.defaultdict(int)
results = []
for job in as_completed(futures):
query, res = job.result(timeout=None)
# res['response'] = res['summary']
# del res['summary']
results.append(res)
# query2res[query] = res
pbar.update(1)
# results = []
# for k, v in query2res.items():
# results.append(v)
# with open(os.path.join(self.eval_args.save_dir, "results.json"), "w", encoding="utf-8", newline="\n") as f:
json.dump(results, outs, indent=4)
pbar.close()
outs.close()
if datatype == 'question_answering':
with open(outfile, 'r') as f:
results = json.load(f)
correct_count = 0
for res in results:
answer = res['summary']
if 'gsm8k' in res['task_name'] or 'GSM' in res['task_name'] or 'MATH' in res['task_name']:
pass
else:
if 'the answer is:\n\n' in answer:
answer = answer.split('the answer is:\n\n')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'the answer is:' in answer:
answer = answer.split('the answer is:')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'the answer is' in answer:
answer = answer.split('the answer is')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'answer is' in answer:
answer = answer.split('answer is')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'is that' in answer:
answer = answer.split('is that')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'is:\n' in answer:
answer = answer.split('is:\n')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'is:' in answer:
answer = answer.split('is:')[-1].split('.')[0].split('\n\n')[0].strip()
elif ' be ' in answer:
answer = answer.split(' be ')[-1].split('.')[0].split('\n\n')[0].strip()
elif ' is ' in answer:
answer = answer.split(' is ')[-1].split('.')[0].split('\n\n')[0].strip()
elif 'option' in answer:
answer = answer.split('option')[-1].split('.')[0].split(' ')[0].split(':')[0].split('\n\n')[0].strip()
elif 'is' in answer:
answer = answer.split('is')[-1].split('.')[0].split('\n\n')[0].strip()
else:
answer = answer.split('answer is')[-1].split('.')[0].split('\n\n')[0].strip()
if res['task_name'] != 'formal_fallacies':
if 'gsm8k' in res['task_name'] or 'GSM' in res['task_name'] or 'MATH' in res['task_name']:
gt_choice, gt_content = extract_answer(copy.deepcopy(res['output']))
md_choice, md_content = extract_answer(copy.deepcopy(answer))
else:
gt_choice, gt_content = extract_answers_for_gt(copy.deepcopy(res['output']))
md_choice, md_content = extract_answers_for_model(copy.deepcopy(answer))
if md_choice and md_choice != 'none' and gt_choice and gt_choice != 'none':
if md_choice in gt_choice:
# check choice
right_scores = 1
correct_count += 1
else:
right_scores = 0
else:
# check content
if md_content == 'none' or md_content == '':
right_scores = 0
elif gt_content == md_content:
right_scores = 1
correct_count += 1
else:
right_scores = 0
else:
gt_choice = 'none'
gt_content = res['output'].lower().strip()
md_choice = 'none'
md_content = answer.lower().strip()
if decide(ground_truth=gt_content, model_answer=md_content):
right_scores = 1
correct_count += 1
else:
right_scores = 0
res['formatted_gt_answer'] = f'Choice:{gt_choice}, Content:{gt_content}'
res['formatted_extracted_model_answers'] = f'Choice:{md_choice}, Content:{md_content}'
res['stu_right_score'] = right_scores
total_num = len(results)
metricss = {
'description': 'macro test results',
"right_num": correct_count,
"total_num": total_num,
"total_accuracy": correct_count / total_num,
}
xxxx = outfile.split('.json')[0]
with open(f'{xxxx}_res.json', 'w') as f:
json.dump([metricss, results], f, indent=4)
def decide(ground_truth: str, model_answer: str):
# for formal_fallacies task check
if 'invalid' in ground_truth:
if 'invalid' in model_answer:
return True
else:
return False
else:
if 'invalid' in model_answer:
return False
else:
return True
# question, query = k, v
# outs.write(json.dumps(query) + '\n')
# import pandas as pd
# df = pd.DataFrame({'query': list(query2res.keys()), 'infer_result': list(query2res.values())})
# df.to_excel('./chatgpt_infer_result.xlsx', index=False)
# outs.write(json.dumps({
# 'question': inputs[0],
# 'answer': answers[0],
# 'label': label,
# 'output': outputs})
# + '\n')
# while index < len(inlines):
# inputs, answers = [], []
# inputs_with_prompts = []
# # for _ in range(1):
# # if index >= len(inlines): break
# # input_with_prompt = add_prompt(inlines[index], prompt)
# # inputs.append(inlines[index]['question']) ## a string
# # answers.append(inlines[index]['answer']) ## a list of strings
# # inputs_with_prompts.append(input_with_prompt)
# # index += 1
# input_with_prompt = add_prompt(inlines[index], prompt)
# inputs.append(input_with_prompt) # a string
# answers.append(inlines[index]['answer']) # a list of strings
# label = chr(ord("A") + inlines[index]
# ['choices'].index(inlines[index]['answer']))
# # samples = defaultdict(list)
# outputs = run_inference(input_with_prompt,
# engine, max_tokens, n, temp)
# if not outputs:
# continue
# # for j, output in enumerate(outputs):
# # samples[j//n].append(output)
# outs.write(json.dumps({
# 'question': inputs[0],
# 'answer': answers[0],
# 'label': label,
# 'output': outputs})
# + '\n')
# index += 1
# pbar.update(1)