From 0eada4854277cb88a188794637354ddfd3b17cfb Mon Sep 17 00:00:00 2001 From: qbc Date: Wed, 10 Apr 2024 10:38:29 +0800 Subject: [PATCH 1/4] add rougel for dolly --- .../llm/eval/eval_for_rougel/eval.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 federatedscope/llm/eval/eval_for_rougel/eval.py diff --git a/federatedscope/llm/eval/eval_for_rougel/eval.py b/federatedscope/llm/eval/eval_for_rougel/eval.py new file mode 100644 index 000000000..6649a47f8 --- /dev/null +++ b/federatedscope/llm/eval/eval_for_rougel/eval.py @@ -0,0 +1,79 @@ +import os + +import numpy as np +import transformers +from tqdm import tqdm +from rouge import Rouge + +from federatedscope.core.configs.config import global_cfg +from federatedscope.core.cmd_args import parse_args, parse_client_cfg +from federatedscope.core.auxiliaries.utils import setup_seed +from federatedscope.core.auxiliaries.logging import update_logger +from federatedscope.core.data.utils import download_url +from federatedscope.llm.dataloader.dataloader import load_jsonl +from federatedscope.llm.misc.fschat import FSChatBot + +transformers.logging.set_verbosity(40) + +DEBUG = False + +rouge = Rouge() + + +def rouge_score(hyps, refs): + try: + rouge_score = rouge.get_scores(hyps, refs)[0]['rouge-l']['f'] + except ValueError: + return 0.0 + return rouge_score + + +def main(): + init_cfg = global_cfg.clone() + args = parse_args() + + if args.cfg_file: + init_cfg.merge_from_file(args.cfg_file) + cfg_opt, client_cfg_opt = parse_client_cfg(args.opts) + init_cfg.merge_from_list(cfg_opt) + + update_logger(init_cfg, clear_before_add=True) + setup_seed(init_cfg.seed) + + # load your finetuned model (saved as xxx.ckpt) + # in yaml file federate.save_to + fschatbot = FSChatBot(init_cfg) + + # Get test file + fp = os.path.join(init_cfg.data.root, "databricks-dolly-15k.jsonl") + if not os.path.exists(fp): + download_url( + 'https://raw.githubusercontent.com/databrickslabs' + 
'/dolly/d000e3030970379aabbf6d291f50ffdd3b715b64' + '/data/databricks-dolly-15k.jsonl', init_cfg.data.root) + os.rename(os.path.join(init_cfg.data.root, 'test.jsonl'), fp) + + list_data_dict = load_jsonl(fp, + instruction='instruction', + input='context', + output='response', + category='category') + answers = [] + for sample in tqdm(list_data_dict): + input_text = sample['instruction'] + generate_kwargs = dict(max_new_tokens=256, top_p=0.95, temperature=0.8) + model_answer = fschatbot.generate(input_text, generate_kwargs) + + rougel_cor = rouge_score(model_answer, sample['output']) + answers.append(rougel_cor) + if DEBUG: + print(f'Full input_text:\n{input_text}\n\n') + print(f'Question: {sample["instruction"]}\n\n' + f'Answers: {model_answer}\n\n') + + print(f'Num of total question: {len(answers)}, ' + f'Average score: {np.average(answers)}.') + + +if __name__ == "__main__": + main() From 1fbdb8bb8b383d7f3639d848c20be219c828a418 Mon Sep 17 00:00:00 2001 From: qbc Date: Wed, 10 Apr 2024 11:05:59 +0800 Subject: [PATCH 2/4] add readme, eval for the summarization task --- federatedscope/llm/eval/eval_for_rougel/README.md | 15 +++++++++++++++ .../llm/eval/eval_for_rougel/__init__.py | 0 federatedscope/llm/eval/eval_for_rougel/eval.py | 4 ++++ 3 files changed, 19 insertions(+) create mode 100644 federatedscope/llm/eval/eval_for_rougel/README.md create mode 100644 federatedscope/llm/eval/eval_for_rougel/__init__.py diff --git a/federatedscope/llm/eval/eval_for_rougel/README.md b/federatedscope/llm/eval/eval_for_rougel/README.md new file mode 100644 index 000000000..305d728b0 --- /dev/null +++ b/federatedscope/llm/eval/eval_for_rougel/README.md @@ -0,0 +1,15 @@ +# Rouge-L + +To assess the performance of our fine-tuned model, we leverage the Rouge-L +metric and conduct experiments with a large number of clients, utilizing the +Dolly-15K dataset as our training corpus. 
The Dolly-15K dataset encompasses +a total of 15,015 data points, distributed across eight distinct tasks. For +a more comprehensive evaluation, we allocate the final task exclusively for +evaluation purposes, while dedicating the remaining ones to the training +phase. Our experimental setup involves a network of 200 clients, utilizing a Dirichlet distribution for data partitioning to emulate non-IID conditions across the client base. + +To do the evaluation, run +```bash +python federatescope/eval/eval_for_rougel/eval.py --cfg +federatescope/llm/baselime/xxx.yaml +``` \ No newline at end of file diff --git a/federatedscope/llm/eval/eval_for_rougel/__init__.py b/federatedscope/llm/eval/eval_for_rougel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/federatedscope/llm/eval/eval_for_rougel/eval.py b/federatedscope/llm/eval/eval_for_rougel/eval.py index 6649a47f8..6efc8fd78 100644 --- a/federatedscope/llm/eval/eval_for_rougel/eval.py +++ b/federatedscope/llm/eval/eval_for_rougel/eval.py @@ -58,6 +58,10 @@ def main(): input='context', output='response', category='category') + + list_data_dict = [ + x for x in list_data_dict if x["category"] == "summarization" + ] answers = [] for sample in tqdm(list_data_dict): input_text = sample['instruction'] From 546224b26680bc5e04ffd96b018e0cd5658157cd Mon Sep 17 00:00:00 2001 From: qbc Date: Thu, 11 Apr 2024 11:55:38 +0800 Subject: [PATCH 3/4] add eval for NI dataset --- .../llm/eval/eval_for_rougel/eval_ni.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 federatedscope/llm/eval/eval_for_rougel/eval_ni.py diff --git a/federatedscope/llm/eval/eval_for_rougel/eval_ni.py b/federatedscope/llm/eval/eval_for_rougel/eval_ni.py new file mode 100644 index 000000000..ce0510769 --- /dev/null +++ b/federatedscope/llm/eval/eval_for_rougel/eval_ni.py @@ -0,0 +1,118 @@ +import json +import os +import random + +import numpy as np +import transformers +from tqdm import tqdm +from rouge 
import Rouge + +from federatedscope.core.configs.config import global_cfg +from federatedscope.core.cmd_args import parse_args, parse_client_cfg +from federatedscope.core.auxiliaries.utils import setup_seed +from federatedscope.core.auxiliaries.logging import update_logger +from federatedscope.core.data.utils import download_url +from federatedscope.llm.misc.fschat import FSChatBot + +transformers.logging.set_verbosity(40) + +DEBUG = False + +rouge = Rouge() + + +def rouge_score(hyps, refs): + try: + rouge_score = rouge.get_scores(hyps, refs)[0]['rouge-l']['f'] + except ValueError: + return 0.0 + return rouge_score + + +def load_data(file_path, + instruction='instruction', + input='input', + output='output', + category='category'): + + # Format: [{'instruction': ..., 'input': ..., 'output':...}] + with open(file_path, 'r', encoding="utf-8") as f: + list_data_dict = json.load(f) + + # Replace key + new_list_data_dict = [] + list_data_dict = list_data_dict["Instances"] + + num_samples = int(len(list_data_dict) * 0.02) + chosen_list_data_dict = random.sample(list_data_dict, num_samples) + + for item in chosen_list_data_dict: + new_item = dict( + instruction=item[instruction] if instruction in item else None, + input=item[input] if input in item else None, + output=item[output][0] if output in item else None, + category=item[category] if category in item else None) + new_list_data_dict.append(new_item) + return new_list_data_dict + + +def main(): + init_cfg = global_cfg.clone() + args = parse_args() + + if args.cfg_file: + init_cfg.merge_from_file(args.cfg_file) + cfg_opt, client_cfg_opt = parse_client_cfg(args.opts) + init_cfg.merge_from_list(cfg_opt) + + update_logger(init_cfg, clear_before_add=True) + setup_seed(init_cfg.seed) + + # load your finetuned model (saved as xxx.ckpt) + # in yaml file federate.save_to + fschatbot = FSChatBot(init_cfg) + + test_tasks_fp = os.path.join( + init_cfg.data.root, + "natural-instructions-2.8/splits/xlingual/test_tasks.txt") + + if 
not os.path.exists(test_tasks_fp): + download_url( + 'https://github.com/allenai/natural-instructions/archive/refs' + '/tags/v2.8.zip', init_cfg.data.root) + print("Please unzip the data, and rerun.") + return + + test_tasks = [] + with open(test_tasks_fp, 'r') as f: + while True: + line = f.readline() + if not line: + break + test_tasks.append(line.strip()) + + list_data_dict = [] + for task in test_tasks: + fp = os.path.join(init_cfg.data.root, "natural-instructions-2.8/tasks", + task + ".json") + list_data_dict.extend(load_data(fp)) + + answers = [] + for sample in tqdm(list_data_dict): + input_text = sample['input'] + generate_kwargs = dict(max_new_tokens=256, top_p=0.95, temperature=0.8) + model_answer = fschatbot.generate(input_text, generate_kwargs) + + rougel_cor = rouge_score(model_answer, sample['output']) + answers.append(rougel_cor) + if DEBUG: + print(f'Full input_text:\n{input_text}\n\n') + print(f'Question: {sample["input"]}\n\n' + f'Answers: {model_answer}\n\n') + + print(f'Num of total question: {len(answers)}, ' + f'Average score: {np.average(answers)}.') + + +if __name__ == "__main__": + main() From 38757f815afbccc8900d63d3fb66e6a0e0e6f32a Mon Sep 17 00:00:00 2001 From: qbc Date: Thu, 11 Apr 2024 12:09:01 +0800 Subject: [PATCH 4/4] add readme --- .../llm/eval/eval_for_rougel/README.md | 20 +++++++++++-------- .../{eval.py => eval_dolly.py} | 0 2 files changed, 12 insertions(+), 8 deletions(-) rename federatedscope/llm/eval/eval_for_rougel/{eval.py => eval_dolly.py} (100%) diff --git a/federatedscope/llm/eval/eval_for_rougel/README.md b/federatedscope/llm/eval/eval_for_rougel/README.md index 305d728b0..d3da1a99c 100644 --- a/federatedscope/llm/eval/eval_for_rougel/README.md +++ b/federatedscope/llm/eval/eval_for_rougel/README.md @@ -1,15 +1,19 @@ # Rouge-L +## Dolly-15K To assess the performance of our fine-tuned model, we leverage the Rouge-L -metric and conduct experiments with a large number of clients, utilizing the -Dolly-15K dataset as our 
training corpus. The Dolly-15K dataset encompasses -a total of 15,015 data points, distributed across eight distinct tasks. For -a more comprehensive evaluation, we allocate the final task exclusively for -evaluation purposes, while dedicating the remaining ones to the training -phase. Our experimental setup involves a network of 200 clients, utilizing a Dirichlet distribution for data partitioning to emulate non-IID conditions across the client base. +metric and conduct experiments with a large number of clients, utilizing the Dolly-15K dataset as our training corpus. +The Dolly-15K dataset encompasses a total of 15,015 data points, distributed across eight distinct tasks. For a more comprehensive evaluation, we allocate the final task exclusively for evaluation purposes, while dedicating the remaining ones to the training phase. Our experimental setup involves a network of 200 clients, utilizing a Dirichlet distribution for data partitioning to emulate non-IID conditions across the client base. To do the evaluation, run ```bash -python federatescope/eval/eval_for_rougel/eval.py --cfg -federatescope/llm/baselime/xxx.yaml +python federatedscope/llm/eval/eval_for_rougel/eval_dolly.py --cfg federatedscope/llm/baseline/xxx.yaml +``` + +## Natural Instructions +We also leverage the Rouge-L metric and conduct experiments with a large number of clients, utilizing the Natural Instructions (NI) dataset as our training corpus. In the NI dataset, we allocate each of the 738 training tasks exclusively to a distinct client for model training, thereby cultivating a non-IID setting characterized by feature distribution skew. Meanwhile, evaluation is performed on separate test tasks. 
+ +To do the evaluation, run +```bash +python federatedscope/llm/eval/eval_for_rougel/eval_ni.py --cfg federatedscope/llm/baseline/xxx.yaml ``` \ No newline at end of file diff --git a/federatedscope/llm/eval/eval_for_rougel/eval.py b/federatedscope/llm/eval/eval_for_rougel/eval_dolly.py similarity index 100% rename from federatedscope/llm/eval/eval_for_rougel/eval.py rename to federatedscope/llm/eval/eval_for_rougel/eval_dolly.py