diff --git a/.buildinfo b/.buildinfo
index f6ef2734a..d7833fa4a 100644
--- a/.buildinfo
+++ b/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
-# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 7a1276b8a4994f80f782c9e1dc560759
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: b658939d076bdeff988fb761020ea848
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/_modules/data_juicer.html b/_modules/data_juicer.html
index 14a7f406d..f984de1f8 100644
diff --git a/_modules/data_juicer/analysis/collector.html b/_modules/data_juicer/analysis/collector.html
new file mode 100644
index 000000000..600463550
--- /dev/null
+++ b/_modules/data_juicer/analysis/collector.html
@@ -0,0 +1,188 @@
Source code for data_juicer.analysis.collector

+from itertools import chain
+
+from data_juicer.format import load_formatter
+from data_juicer.utils.lazy_loader import LazyLoader
+
+torch = LazyLoader('torch', 'torch')
+transformers = LazyLoader('transformers', 'transformers')
+
+
+
+[docs]
+class TextTokenDistCollector(object):
+    """Tokenize and collect the token distribution for a given
+    dataset with a specified tokenizer.
+    """
+
+[docs]
+    def __init__(self, tokenizer):
+        """
+        Initialization method.
+
+        :param tokenizer: tokenizer name on huggingface
+        """
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+            tokenizer, trust_remote_code=True)
+        self.vocab_size = len(self.tokenizer)
+
+[docs]
+    def collect(self,
+                data_path,
+                text_key,
+                num_proc=1) -> 'torch.distributions.Categorical':
+        """
+        Tokenize and collect the token distribution of the input dataset.
+
+        :param data_path: path to the input dataset.
+        :param text_key: field key of the text to count tokens over.
+        :param num_proc: number of processes used to count tokens.
+        :return: token distribution.
+        """
+        formatter = load_formatter(data_path)
+        dataset = formatter.load_dataset(num_proc=num_proc)
+        assert text_key in dataset.features, \
+            f'{text_key} not found in dataset'
+
+        def prepare_tokenizer(tokenizer, text_key):
+            """
+            Prepare a tokenizer function for the dataset.
+
+            :param tokenizer: a tokenizer to tokenize samples.
+            :param text_key: field key of the text to count tokens over.
+            """
+
+            def _tokenize_fn(example):
+                return tokenizer(example[text_key],
+                                 add_special_tokens=False)
+
+            return _tokenize_fn
+
+        tokenize_proc = prepare_tokenizer(self.tokenizer, text_key)
+        dataset = dataset.map(tokenize_proc,
+                              num_proc=num_proc,
+                              desc=f'tokenize {data_path.split("/")[-1]}')
+
+        token_count = torch.zeros(self.vocab_size, dtype=torch.int64)
+        token_ids = torch.tensor(
+            list(chain.from_iterable(dataset['input_ids'])))
+        indices, counts = token_ids.unique(return_counts=True)
+        token_count.scatter_(0, indices, counts.to(token_count.dtype))
+        dist = torch.distributions.Categorical(token_count)
+        return dist
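A minimal usage sketch for the collector above; the tokenizer name and dataset path are hypothetical placeholders, and any HuggingFace tokenizer plus a dataset with a 'text' field would do:

# Hypothetical usage: 'gpt2' and the data path are placeholders.
collector = TextTokenDistCollector('gpt2')
dist = collector.collect('demo-data.jsonl', text_key='text', num_proc=4)
# dist is a torch Categorical; its probs form a normalized
# token-frequency vector over the tokenizer's vocabulary.
print(dist.probs.shape)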
\ No newline at end of file
diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html
index 6d276ed4e..7f6ff9d58 100644
@@ -90,6 +90,8 @@

Source code for data_juicer.analysis.column_wise_analysis

 from .overall_analysis import OverallAnalysis

+
+[docs]
 def get_row_col(total_num, factor=2):
     """
     Given the total number of stats figures, get the "best" number of rows and
@@ -128,16 +130,17 @@


     for i in range(total_num):
         grids.append((i // now_col, i % now_col))
-    return int(now_row), int(now_col), grids
+    return int(now_row), int(now_col), grids
+
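A short usage sketch for get_row_col; the figure count is made up, and the exact grid chosen depends on the body elided by the hunk above:

import matplotlib.pyplot as plt

# e.g. arrange 10 stats figures in a near-square grid; grids holds
# one (row, col) position per figure.
rows, cols, grids = get_row_col(10)
fig, axes = plt.subplots(rows, cols, squeeze=False)
for r, c in grids:
    axes[r][c].set_title(f'stats figure at ({r}, {c})')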
-[docs]
+[docs]
 class ColumnWiseAnalysis:
     """Apply analysis on each column of stats respectively."""

-[docs]
+[docs]
     def __init__(self,
                  dataset,
                  output_path,
@@ -173,7 +176,7 @@


-[docs]
+[docs]
     def analyze(self, show_percentiles=False, show=False, skip_export=False):
         """
         Apply analysis and draw the analysis figure for stats.
@@ -291,7 +294,7 @@


-[docs]
+[docs]
     def draw_hist(self, ax, data, save_path, percentiles=None, show=False):
         """
         Draw the histogram for the data.
@@ -352,7 +355,7 @@


-[docs]
+[docs]
     def draw_box(self, ax, data, save_path, percentiles=None, show=False):
         """
         Draw the box plot for the data.
@@ -403,7 +406,7 @@


-[docs]
+[docs]
     def draw_wordcloud(self, ax, data, save_path, show=False):
         word_list = data.tolist()
         word_nums = {}
diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html
index 040686ef9..4e60906ef 100644
@@ -88,6 +88,8 @@

Source code for data_juicer.analysis.diversity_analysis

 # Modify from self_instruct, please refer to
 # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb
+
+[docs]
 def find_root_verb_and_its_dobj(tree_root):
     """
     Find the verb and its object closest to the root.
@@ -108,11 +110,14 @@


     for child in tree_root.children:
         return find_root_verb_and_its_dobj(child)
     # if no children satisfy the condition, return None
-    return None, None
+    return None, None
+

 # Modify from self_instruct, please refer to
 # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb
+
+[docs]
 def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True):
     """
     Find the verb and its object closest to the root of lexical tree of input
@@ -131,9 +136,12 @@


         verb, noun = find_root_verb_and_its_dobj(sent.root)
         if first_sent or (verb is not None and noun is not None):
             return verb, noun
-    return None, None
+    return None, None
+
+
+[docs]
 def get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs):
     """
     Given the lexical tree analysis result, return the diversity results.
@@ -158,17 +166,18 @@


     df = df.groupby('verb').apply(lambda x: x.sort_values(
         'count', ascending=False).head(top_k_nouns)).reset_index(drop=True)
-    return df
+    return df
+
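A hedged sketch of get_diversity, assuming its input can be a pandas DataFrame of per-sample verb-noun pairs such as DiversityAnalysis.compute produces; the column names are inferred from the surrounding code:

import pandas as pd

# Made-up verb-noun pairs standing in for compute() output.
raw_df = pd.DataFrame({'verb': ['make', 'make', 'write', 'make'],
                       'noun': ['plan', 'list', 'essay', 'plan']})
div_df = get_diversity(raw_df, top_k_verbs=2, top_k_nouns=1)
# div_df keeps the most frequent verbs and, for each verb, its top nouns.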
-[docs]
+[docs]
 class DiversityAnalysis:
     """Apply diversity analysis for each sample and get an overall analysis
     result."""

-[docs]
+[docs]
     def __init__(self, dataset, output_path, lang_or_model='en'):
         """Initialization method

         :param dataset: the dataset to be analyzed
         :param output_path: path to store the analysis results
         :param
@@ -183,7 +192,7 @@


-[docs]
+[docs]
     def compute(self, lang_or_model=None, column_name='text'):
         """
         Apply lexical tree analysis on each sample.
@@ -217,7 +226,7 @@


-[docs]
+[docs]
     def analyze(self,
                 lang_or_model=None,
                 column_name='text',
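An end-to-end usage sketch for DiversityAnalysis; the dataset, output path, and column name are placeholders, and analyze's remaining parameters are elided by the hunk above:

# Hypothetical: 'dataset' is a loaded dataset with a 'text' column.
da = DiversityAnalysis(dataset, './analysis', lang_or_model='en')
div_df = da.analyze(column_name='text')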
diff --git a/_modules/data_juicer/analysis/draw.html b/_modules/data_juicer/analysis/draw.html
new file mode 100644
index 000000000..700b99410
--- /dev/null
+++ b/_modules/data_juicer/analysis/draw.html
@@ -0,0 +1,154 @@
Source code for data_juicer.analysis.draw

+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+
+
+
+[docs]
+def draw_heatmap(data, xlabels, ylabels=None, figsize=None, triangle=False):
+    """
+    Draw a heatmap of the input data with the specified labels.
+
+    :param data: input data; supports `list`, `tuple`, `numpy array`,
+        and `torch tensor`.
+    :param xlabels: x-axis labels.
+    :param ylabels: y-axis labels; if None, use xlabels.
+    :param figsize: figure size.
+    :param triangle: only display the lower triangle.
+    :return: a plot figure.
+    """
+    figsize = figsize if figsize else (8 * 2.5, 6 * 2.5)
+    _, ax = plt.subplots(figsize=figsize)
+    mask = None
+    if triangle:
+        mask = np.triu(np.ones_like(data))
+    ax.tick_params(
+        right=True,
+        top=True,
+        labelright=True,
+        labeltop=True,
+    )
+    sns.heatmap(data,
+                ax=ax,
+                cmap='Oranges',
+                annot=True,
+                mask=mask,
+                linewidths=.05,
+                square=True,
+                xticklabels=xlabels,
+                yticklabels=ylabels,
+                annot_kws={'size': 8})
+    plt.subplots_adjust(left=.1, right=0.95, bottom=0.22, top=0.95)
+    fig = plt.gcf()
+    plt.show()
+    return fig
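A small usage sketch for draw_heatmap with a made-up similarity matrix and labels:

import numpy as np

# Hypothetical 3x3 similarity matrix between three operators.
sim = np.array([[1.0, 0.6, 0.2],
                [0.6, 1.0, 0.4],
                [0.2, 0.4, 1.0]])
fig = draw_heatmap(sim, xlabels=['op_a', 'op_b', 'op_c'], triangle=True)
fig.savefig('heatmap.png')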
\ No newline at end of file
diff --git a/_modules/data_juicer/analysis/measure.html b/_modules/data_juicer/analysis/measure.html
new file mode 100644
index 000000000..c1e319cba
--- /dev/null
+++ b/_modules/data_juicer/analysis/measure.html
@@ -0,0 +1,372 @@

Source code for data_juicer.analysis.measure

+import numpy as np
+
+from data_juicer.utils.lazy_loader import LazyLoader
+
+torch = LazyLoader('torch', 'torch')
+td = LazyLoader('td', 'torch.distributions')
+F = LazyLoader('F', 'torch.nn.functional')
+
+stats = LazyLoader('stats', 'scipy.stats')
+
+
+
+[docs]
+class Measure(object):
+    """Base class for distribution measures."""
+    name = 'base'
+
+[docs]
+    def measure(self, *args, **kwargs):
+        pass
+
+    def __call__(self, *args, **kwargs):
+        return self.measure(*args, **kwargs)
+
+    def _convert_to_tensor(self, p):
+        """
+        Convert input data to a torch tensor.
+        :param p: input data; supports `scalar`, `list`, `tuple`,
+            `torch binary file`, and `Categorical`.
+        :return: torch tensor.
+        """
+        if isinstance(p, torch.Tensor):
+            return p
+        elif isinstance(p, td.Categorical):
+            return p.probs
+        elif isinstance(p, str):
+            return torch.load(p)
+        else:
+            return torch.tensor(p)
+
+    def _convert_to_categorical(self, p):
+        """
+        Convert input data to a torch Categorical.
+        :param p: input data; supports `scalar`, `list`, `tuple`,
+            `torch binary file`, and `Categorical`.
+        :return: torch Categorical.
+        """
+        if isinstance(p, td.Categorical):
+            return p
+        elif isinstance(p, torch.Tensor):
+            return td.Categorical(p)
+        elif isinstance(p, str):
+            return td.Categorical(torch.load(p))
+        else:
+            return td.Categorical(torch.tensor(p))
+
+    def _convert_to_ndarray(self, p):
+        """
+        Convert input data to a numpy ndarray.
+        :param p: input data; supports `scalar`, `list`, `tuple`,
+            `torch binary file`, and `Categorical`.
+        :return: numpy ndarray.
+        """
+        return self._convert_to_tensor(p).numpy()
+
+[docs]
+class KLDivMeasure(Measure):
+    """
+    Measure Kullback-Leibler divergence.
+    """
+    name = 'kl_divergence'
+
+[docs]
+    def measure(self, p, q):
+        p = self._convert_to_categorical(p)
+        q = self._convert_to_categorical(q)
+        assert p.probs.shape == q.probs.shape, \
+            'The two inputs have different shapes: ' \
+            f'{p.probs.shape} != {q.probs.shape} in {self.name}'
+        return F.kl_div(q.logits, p.probs, log_target=False, reduction='sum')
+
+[docs]
+class JSDivMeasure(Measure):
+    """
+    Measure Jensen-Shannon divergence.
+    """
+    name = 'js_divergence'
+
+[docs]
+    def measure(self, p, q):
+        p = self._convert_to_tensor(p)
+        q = self._convert_to_tensor(q)
+        assert p.shape == q.shape, \
+            'The two inputs have different shapes: ' \
+            f'{p.shape} != {q.shape} in {self.name}'
+
+        m = 0.5 * (p + q)
+        kl_p = KLDivMeasure()(p, m)
+        kl_q = KLDivMeasure()(q, m)
+        js = 0.5 * (kl_p + kl_q)
+        return js
+
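A toy comparison of the two divergence measures above; the count vectors are made up, and both measures accept anything the base-class converters handle:

p = [40.0, 30.0, 20.0, 10.0]   # made-up token counts
q = [25.0, 25.0, 25.0, 25.0]
kl = KLDivMeasure()(p, q)      # KL(p || q) after normalization
js = JSDivMeasure()(p, q)      # symmetric: 0.5 * (KL(p||m) + KL(q||m))
print(float(kl), float(js))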
+[docs]
+class CrossEntropyMeasure(Measure):
+    """
+    Measure Cross-Entropy.
+    """
+    name = 'cross_entropy'
+
+[docs]
+    def measure(self, p, q):
+        p = self._convert_to_categorical(p)
+        q = self._convert_to_categorical(q)
+        assert p.probs.shape == q.probs.shape, \
+            'The two inputs have different shapes: ' \
+            f'{p.probs.shape} != {q.probs.shape} in {self.name}'
+        return F.cross_entropy(q.logits, p.probs, reduction='sum')
+
+[docs]
+class EntropyMeasure(Measure):
+    """
+    Measure Entropy.
+    """
+    name = 'entropy'
+
+[docs]
+    def measure(self, p):
+        p = self._convert_to_categorical(p)
+        return p.entropy()
+
+[docs]
+class RelatedTTestMeasure(Measure):
+    """
+    Measure the t-test for two related distributions on their histograms
+    over the same bins.
+
+    Ref: https://en.wikipedia.org/wiki/Student%27s_t-test
+
+    For continuous features or distributions, the input could be dataset
+    stats lists.
+    For discrete features or distributions, the input could be tag or
+    category lists.
+    """
+    name = 't-test'
+
+[docs]
+    @staticmethod
+    def stats_to_hist(p, q):
+        p = np.array(p)
+        q = np.array(q)
+
+        # get the common maximum number of data samples, and max/min values
+        max_data_num = max(len(p), len(q))
+        min_val = min(min(p), min(q))
+        max_val = max(max(p), max(q))
+
+        # get a recommended number of bins
+        rec_bins = max(int(np.sqrt(max_data_num)), 10)
+
+        # get the common bin edges
+        common_p = np.append(p, [min_val, max_val])
+        hist_p, bin_edges = np.histogram(common_p, bins=rec_bins)
+        # restore the hist of the original p
+        hist_p[0] -= 1
+        hist_p[-1] -= 1
+        # get the hist of the original q using the common bin edges
+        hist_q, _ = np.histogram(q, bins=bin_edges)
+        return hist_p, hist_q, bin_edges
+
+[docs]
+    @staticmethod
+    def category_to_hist(p, q):
+
+        def flatten_list(lst):
+            res = []
+            for s in lst:
+                if isinstance(s, list):
+                    res.extend(flatten_list(s))
+                else:
+                    res.append(s)
+            return res
+
+        # flatten the lists
+        p = flatten_list(p)
+        q = flatten_list(q)
+
+        # get the common categories
+        cat_p = set(p)
+        cat_q = set(q)
+        cat_common = cat_p.union(cat_q)
+
+        # get category distributions
+        count_p = {cat: 0 for cat in cat_common}
+        count_q = {cat: 0 for cat in cat_common}
+        for cat in p:
+            count_p[cat] += 1
+        for cat in q:
+            count_q[cat] += 1
+
+        # only keep distribution values sorted by counts
+        sorted_cat = list(count_p.items())
+        sorted_cat.sort(key=lambda it: it[1], reverse=True)
+        sorted_cat = [it[0] for it in sorted_cat]
+        # get the value dists
+        hist_p = [count_p[cat] for cat in sorted_cat]
+        hist_q = [count_q[cat] for cat in sorted_cat]
+
+        return hist_p, hist_q, count_p, count_q, sorted_cat
+
+[docs]
+    def measure(self, p, q):
+        """
+        :param p: the first feature or distribution (stats/tags/categories).
+        :param q: the second feature or distribution (stats/tags/categories).
+        :return: the t-test result object -- ([ref](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats._result_classes.TtestResult.html#scipy.stats._result_classes.TtestResult)) # noqa: E501
+        """
+        ele = p[0]
+        while isinstance(ele, list):
+            ele = ele[0]
+        if isinstance(ele, str):
+            # discrete tags or categories
+            hist_p, hist_q = self.category_to_hist(p, q)[:2]
+        else:
+            # continuous stats
+            hist_p, hist_q = self.stats_to_hist(p, q)[:2]
+
+        # compute the t-test and p-value for hist_p and hist_q
+        ttest_res = stats.ttest_rel(hist_p, hist_q)
+        return ttest_res
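A quick sketch of the related t-test on two made-up continuous stats lists:

import numpy as np

rng = np.random.default_rng(0)
stats_a = rng.normal(5.0, 1.0, 1000).tolist()  # e.g. per-sample stats of dataset A
stats_b = rng.normal(5.2, 1.1, 1000).tolist()  # e.g. per-sample stats of dataset B
res = RelatedTTestMeasure()(stats_a, stats_b)
print(res.statistic, res.pvalue)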
\ No newline at end of file
diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html
index f8db011c0..0c47c8f83 100644
@@ -93,13 +93,13 @@

Source code for data_juicer.analysis.overall_analysis

-[docs]
+[docs]
 class OverallAnalysis:
     """Apply analysis on the overall stats, including mean, std, quantiles,
     etc."""
-[docs]
+[docs]
     def __init__(self, dataset, output_path):
         """
         Initialization method.
@@ -129,7 +129,7 @@


-[docs]
+[docs]
     def refine_single_column(self, col):
         if col.dtype != 'object':
             # not an object, return directly
@@ -152,7 +152,7 @@


-[docs]
+[docs]
     def analyze(self, percentiles=[], num_proc=1, skip_export=False):
         """
         Apply overall analysis on the whole dataset based on the describe
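A hedged usage sketch for OverallAnalysis; the dataset and output path are placeholders:

# Hypothetical: summarize every stats column and export under './analysis'.
oa = OverallAnalysis(dataset, './analysis')
overall_df = oa.analyze(percentiles=[0.25, 0.5, 0.75], num_proc=4)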
diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html
index f98779731..e72d0751b 100644
@@ -103,7 +103,7 @@

Source code for data_juicer.config.config

 
 
 
-[docs]
+[docs]
 def init_configs(args: Optional[List[str]] = None, which_entry: object = None):
     """
     initialize the jsonargparse parser and parse configs from one of:
@@ -481,6 +481,8 @@


 
 
 
+
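A usage sketch for init_configs, assuming the data_juicer.config package re-exports it and that a YAML config is passed via --config as in Data-Juicer's demos; the path and the printed attribute are hypothetical:

from data_juicer.config import init_configs

cfg = init_configs(['--config', 'configs/demo.yaml'])
print(cfg.project_name)  # attribute name assumed for illustration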