From 0eada4854277cb88a188794637354ddfd3b17cfb Mon Sep 17 00:00:00 2001
From: qbc <qianbingchen.qbc@alibaba-inc.com>
Date: Wed, 10 Apr 2024 10:38:29 +0800
Subject: [PATCH 1/4] add rougel for dolly

---
 .../llm/eval/eval_for_rougel/eval.py          | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 federatedscope/llm/eval/eval_for_rougel/eval.py

diff --git a/federatedscope/llm/eval/eval_for_rougel/eval.py b/federatedscope/llm/eval/eval_for_rougel/eval.py
new file mode 100644
index 000000000..6649a47f8
--- /dev/null
+++ b/federatedscope/llm/eval/eval_for_rougel/eval.py
@@ -0,0 +1,79 @@
+import os
+
+import numpy as np
+import transformers
+from tqdm import tqdm
+from rouge import Rouge
+
+from federatedscope.core.configs.config import global_cfg
+from federatedscope.core.cmd_args import parse_args, parse_client_cfg
+from federatedscope.core.auxiliaries.utils import setup_seed
+from federatedscope.core.auxiliaries.logging import update_logger
+from federatedscope.core.data.utils import download_url
+from federatedscope.llm.dataloader.dataloader import load_jsonl
+from federatedscope.llm.misc.fschat import FSChatBot
+
+transformers.logging.set_verbosity(40)  # 40 is the ERROR level
+
+DEBUG = False  # set True to also print the full input text per sample
+
+rouge = Rouge()  # shared scorer instance used by rouge_score
+
+
+def rouge_score(hyps, refs):
+    """Return the Rouge-L F-score of hyps against refs; 0.0 on ValueError."""
+    try:
+        return rouge.get_scores(hyps, refs)[0]['rouge-l']['f']
+    except ValueError:
+        return 0.0
+
+
+def main():
+    """Evaluate the fine-tuned model on Dolly-15K with Rouge-L."""
+    init_cfg = global_cfg.clone()
+    args = parse_args()
+
+    if args.cfg_file:
+        init_cfg.merge_from_file(args.cfg_file)
+    cfg_opt, _ = parse_client_cfg(args.opts)
+    init_cfg.merge_from_list(cfg_opt)
+
+    update_logger(init_cfg, clear_before_add=True)
+    setup_seed(init_cfg.seed)
+
+    # load your finetuned model (saved as xxx.ckpt)
+    #    in yaml file federate.save_to
+    fschatbot = FSChatBot(init_cfg)
+
+    # Get test file (download_url should save it as the URL basename = fp)
+    fp = os.path.join(init_cfg.data.root, "databricks-dolly-15k.jsonl")
+    if not os.path.exists(fp):
+        download_url(
+            'https://raw.githubusercontent.com/databrickslabs'
+            '/dolly/d000e3030970379aabbf6d291f50ffdd3b715b64'
+            '/data/databricks-dolly-15k.jsonl', init_cfg.data.root)
+
+    list_data_dict = load_jsonl(fp,
+                                instruction='instruction',
+                                input='context',
+                                output='response',
+                                category='category')
+    answers = []
+    for sample in tqdm(list_data_dict):
+        input_text = sample['instruction']
+        generate_kwargs = dict(max_new_tokens=256, top_p=0.95, temperature=0.8)
+        model_answer = fschatbot.generate(input_text, generate_kwargs)
+
+        rougel_cor = rouge_score(model_answer, sample['output'])
+        answers.append(rougel_cor)
+        if DEBUG:
+            print(f'Full input_text:\n{input_text}\n\n')
+        print(f'Question: {sample["instruction"]}\n\n'
+              f'Answers: {model_answer}\n\n')
+
+        print(f'Num of total question: {len(answers)}, '
+              f'Average score: {np.average(answers)}.')
+
+
+if __name__ == "__main__":
+    main()

From 1fbdb8bb8b383d7f3639d848c20be219c828a418 Mon Sep 17 00:00:00 2001
From: qbc <qianbingchen.qbc@alibaba-inc.com>
Date: Wed, 10 Apr 2024 11:05:59 +0800
Subject: [PATCH 2/4] add readme, eval for the summarization task

---
 federatedscope/llm/eval/eval_for_rougel/README.md | 15 +++++++++++++++
 .../llm/eval/eval_for_rougel/__init__.py          |  0
 federatedscope/llm/eval/eval_for_rougel/eval.py   |  4 ++++
 3 files changed, 19 insertions(+)
 create mode 100644 federatedscope/llm/eval/eval_for_rougel/README.md
 create mode 100644 federatedscope/llm/eval/eval_for_rougel/__init__.py

diff --git a/federatedscope/llm/eval/eval_for_rougel/README.md b/federatedscope/llm/eval/eval_for_rougel/README.md
new file mode 100644
index 000000000..305d728b0
--- /dev/null
+++ b/federatedscope/llm/eval/eval_for_rougel/README.md
@@ -0,0 +1,15 @@
+# Rouge-L
+
+To assess the performance of our fine-tuned model, we leverage the Rouge-L 
+metric and conduct experiments with a large number of clients, utilizing the 
+Dolly-15K dataset as our training corpus. The Dolly-15K dataset encompasses 
+a total of 15,015 data points, distributed across eight distinct tasks. For 
+a more comprehensive evaluation, we allocate the final task exclusively for 
+evaluation purposes, while dedicating the remaining ones to the training 
+phase. Our experimental setup involves a network of 200 clients, utilizing a Dirichlet distribution for data partitioning to emulate non-IID conditions across the client base.
+
+To do the evaluation, run
+```bash
+python federatescope/eval/eval_for_rougel/eval.py --cfg 
+federatescope/llm/baselime/xxx.yaml
+```
\ No newline at end of file
diff --git a/federatedscope/llm/eval/eval_for_rougel/__init__.py b/federatedscope/llm/eval/eval_for_rougel/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/federatedscope/llm/eval/eval_for_rougel/eval.py b/federatedscope/llm/eval/eval_for_rougel/eval.py
index 6649a47f8..6efc8fd78 100644
--- a/federatedscope/llm/eval/eval_for_rougel/eval.py
+++ b/federatedscope/llm/eval/eval_for_rougel/eval.py
@@ -58,6 +58,10 @@ def main():
                                 input='context',
                                 output='response',
                                 category='category')
+
+    list_data_dict = [
+        x for x in list_data_dict if x["category"] == "summarization"
+    ]
     answers = []
     for sample in tqdm(list_data_dict):
         input_text = sample['instruction']

From 546224b26680bc5e04ffd96b018e0cd5658157cd Mon Sep 17 00:00:00 2001
From: qbc <qianbingchen.qbc@alibaba-inc.com>
Date: Thu, 11 Apr 2024 11:55:38 +0800
Subject: [PATCH 3/4] add eval for NI dataset

---
 .../llm/eval/eval_for_rougel/eval_ni.py       | 118 ++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 federatedscope/llm/eval/eval_for_rougel/eval_ni.py

diff --git a/federatedscope/llm/eval/eval_for_rougel/eval_ni.py b/federatedscope/llm/eval/eval_for_rougel/eval_ni.py
new file mode 100644
index 000000000..ce0510769
--- /dev/null
+++ b/federatedscope/llm/eval/eval_for_rougel/eval_ni.py
@@ -0,0 +1,118 @@
+import json
+import os
+import random
+
+import numpy as np
+import transformers
+from tqdm import tqdm
+from rouge import Rouge
+
+from federatedscope.core.configs.config import global_cfg
+from federatedscope.core.cmd_args import parse_args, parse_client_cfg
+from federatedscope.core.auxiliaries.utils import setup_seed
+from federatedscope.core.auxiliaries.logging import update_logger
+from federatedscope.core.data.utils import download_url
+from federatedscope.llm.misc.fschat import FSChatBot
+
+transformers.logging.set_verbosity(40)  # 40 is the ERROR level
+
+DEBUG = False  # set True to also print the full input text per sample
+
+rouge = Rouge()  # shared scorer instance used by rouge_score
+
+
+def rouge_score(hyps, refs):
+    """Return the Rouge-L F-score of hyps against refs; 0.0 on ValueError."""
+    try:
+        return rouge.get_scores(hyps, refs)[0]['rouge-l']['f']
+    except ValueError:
+        return 0.0
+
+
+def load_data(file_path,
+              instruction='instruction',
+              input='input',
+              output='output',
+              category='category'):
+    """Load a random 2% sample of the instances stored in a task file.
+
+    The file is parsed as JSON and its "Instances" list is sampled; every
+    chosen item is re-keyed to instruction/input/output/category, with None
+    for absent keys and only the first element of the output field kept.
+    """
+    with open(file_path, 'r', encoding="utf-8") as f:
+        instances = json.load(f)["Instances"]
+
+    sample_size = int(len(instances) * 0.02)
+    picked = random.sample(instances, sample_size)
+
+    # Replace key
+    # Format: [{'instruction': ..., 'input': ..., 'output':...}]
+    return [
+        dict(instruction=entry.get(instruction),
+             input=entry.get(input),
+             output=entry[output][0] if output in entry else None,
+             category=entry.get(category)) for entry in picked
+    ]
+
+
+def main():
+    """Evaluate the fine-tuned model on NI test tasks with Rouge-L."""
+    init_cfg = global_cfg.clone()
+    args = parse_args()
+
+    if args.cfg_file:
+        init_cfg.merge_from_file(args.cfg_file)
+    cfg_opt, _ = parse_client_cfg(args.opts)
+    init_cfg.merge_from_list(cfg_opt)
+
+    update_logger(init_cfg, clear_before_add=True)
+    setup_seed(init_cfg.seed)
+
+    # load your finetuned model (saved as xxx.ckpt)
+    #    in yaml file federate.save_to
+    fschatbot = FSChatBot(init_cfg)
+
+    test_tasks_fp = os.path.join(
+        init_cfg.data.root,
+        "natural-instructions-2.8/splits/xlingual/test_tasks.txt")
+
+    if not os.path.exists(test_tasks_fp):
+        download_url(
+            'https://github.com/allenai/natural-instructions/archive/refs'
+            '/tags/v2.8.zip', init_cfg.data.root)
+        print("Please unzip the data, and rerun.")
+        return
+
+    # One task name per line; blank lines are skipped so that no bogus
+    # ".json" path is built from an empty name.
+    with open(test_tasks_fp, 'r', encoding="utf-8") as f:
+        stripped = (line.strip() for line in f)
+        test_tasks = [name for name in stripped if name]
+
+    list_data_dict = []
+    for task in test_tasks:
+        fp = os.path.join(init_cfg.data.root, "natural-instructions-2.8/tasks",
+                          task + ".json")
+        list_data_dict.extend(load_data(fp))
+
+    # Generate an answer per sample and print the running Rouge-L average
+    answers = []
+    for sample in tqdm(list_data_dict):
+        input_text = sample['input']
+        generate_kwargs = dict(max_new_tokens=256, top_p=0.95, temperature=0.8)
+        model_answer = fschatbot.generate(input_text, generate_kwargs)
+
+        rougel_cor = rouge_score(model_answer, sample['output'])
+        answers.append(rougel_cor)
+        if DEBUG:
+            print(f'Full input_text:\n{input_text}\n\n')
+        print(f'Question: {sample["input"]}\n\n'
+              f'Answers: {model_answer}\n\n')
+
+        print(f'Num of total question: {len(answers)}, '
+              f'Average score: {np.average(answers)}.')
+
+
+if __name__ == "__main__":
+    main()

From 38757f815afbccc8900d63d3fb66e6a0e0e6f32a Mon Sep 17 00:00:00 2001
From: qbc <qianbingchen.qbc@alibaba-inc.com>
Date: Thu, 11 Apr 2024 12:09:01 +0800
Subject: [PATCH 4/4] add readme

---
 .../llm/eval/eval_for_rougel/README.md        | 20 +++++++++++--------
 .../{eval.py => eval_dolly.py}                |  0
 2 files changed, 12 insertions(+), 8 deletions(-)
 rename federatedscope/llm/eval/eval_for_rougel/{eval.py => eval_dolly.py} (100%)

diff --git a/federatedscope/llm/eval/eval_for_rougel/README.md b/federatedscope/llm/eval/eval_for_rougel/README.md
index 305d728b0..d3da1a99c 100644
--- a/federatedscope/llm/eval/eval_for_rougel/README.md
+++ b/federatedscope/llm/eval/eval_for_rougel/README.md
@@ -1,15 +1,19 @@
 # Rouge-L
 
+## Dolly-15K
 To assess the performance of our fine-tuned model, we leverage the Rouge-L 
-metric and conduct experiments with a large number of clients, utilizing the 
-Dolly-15K dataset as our training corpus. The Dolly-15K dataset encompasses 
-a total of 15,015 data points, distributed across eight distinct tasks. For 
-a more comprehensive evaluation, we allocate the final task exclusively for 
-evaluation purposes, while dedicating the remaining ones to the training 
-phase. Our experimental setup involves a network of 200 clients, utilizing a Dirichlet distribution for data partitioning to emulate non-IID conditions across the client base.
+metric and conduct experiments with a large number of clients, utilizing the Dolly-15K dataset as our training corpus. 
+The Dolly-15K dataset encompasses a total of 15,015 data points, distributed across eight distinct tasks. For a more comprehensive evaluation, we allocate the final task exclusively for evaluation purposes, while dedicating the remaining ones to the training phase. Our experimental setup involves a network of 200 clients, utilizing a Dirichlet distribution for data partitioning to emulate non-IID conditions across the client base.
 
 To do the evaluation, run
 ```bash
-python federatescope/eval/eval_for_rougel/eval.py --cfg 
-federatescope/llm/baselime/xxx.yaml
+python federatedscope/llm/eval/eval_for_rougel/eval_dolly.py --cfg federatedscope/llm/baseline/xxx.yaml
+```
+
+## Natural Instructions
+We also leverage the Rouge-L metric and conduct experiments with a large number of clients, utilizing the Natural Instructions (NI) dataset as our training corpus.  In the NI dataset, we allocate each of the 738 training tasks exclusively to a distinct client for model training, thereby cultivating a non-IID setting characterized by feature distribution skew. Meanwhile, evaluation is performed on separate test tasks.
+
+To do the evaluation, run
+```bash
+python federatedscope/llm/eval/eval_for_rougel/eval_ni.py --cfg federatedscope/llm/baseline/xxx.yaml
 ```
\ No newline at end of file
diff --git a/federatedscope/llm/eval/eval_for_rougel/eval.py b/federatedscope/llm/eval/eval_for_rougel/eval_dolly.py
similarity index 100%
rename from federatedscope/llm/eval/eval_for_rougel/eval.py
rename to federatedscope/llm/eval/eval_for_rougel/eval_dolly.py