diff --git a/.buildinfo b/.buildinfo index d7833fa4a..f6ef2734a 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 -# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: b658939d076bdeff988fb761020ea848 +# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 7a1276b8a4994f80f782c9e1dc560759 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_modules/data_juicer.html b/_modules/data_juicer.html index f984de1f8..14a7f406d 100644 --- a/_modules/data_juicer.html +++ b/_modules/data_juicer.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@ diff --git a/_modules/data_juicer/analysis/collector.html b/_modules/data_juicer/analysis/collector.html deleted file mode 100644 index 600463550..000000000 --- a/_modules/data_juicer/analysis/collector.html +++ /dev/null @@ -1,188 +0,0 @@ - - - - - - - - data_juicer.analysis.collector — data_juicer 1.0.3 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.analysis.collector

-from itertools import chain
-
-from data_juicer.format import load_formatter
-from data_juicer.utils.lazy_loader import LazyLoader
-
-torch = LazyLoader('torch', 'torch')
-transformers = LazyLoader('transformers', 'transformers')
-
-
-
-[docs] -class TextTokenDistCollector(object): - """Tokenize and collect distribution of tokens for given - dataset with a specified tokenizer. - """ - -
-[docs] - def __init__(self, tokenizer): - """ - Initialization method. - - :param tokenizer: tokenizer name on huggingface - """ - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer, trust_remote_code=True) - self.vocab_size = len(self.tokenizer)
- - -
-[docs] - def collect(self, - data_path, - text_key, - num_proc=1) -> 'torch.distributions.Categorical': - """ - Tokenize and collect tokens distribution of input dataset - :param data_path: path to input dataset. - :param text_key: field keys that will be considered into token counts. - :param num_proc: number of processes to count tokens. - :return: token distribution. - """ - - formatter = load_formatter(data_path) - dataset = formatter.load_dataset(num_proc=num_proc) - assert text_key in dataset.features, f'[{text_key} not find in dataset' - - def prepare_tokenizer( - tokenizer, - text_key, - ): - """ - Prepare a tokenizer function for dataset. - :param tokenizer: a tokenizer to tokenize sample. - :param text_key: field keys that will be - considered into token counts. - """ - - def _tokenize_fn(example, ): - example = tokenizer(example[text_key], - add_special_tokens=False) - return example - - return _tokenize_fn - - tokenize_proc = prepare_tokenizer(self.tokenizer, text_key) - dataset = dataset.map(tokenize_proc, - num_proc=num_proc, - desc=f'tokenize {data_path.split("/")[-1]}') - - token_count = torch.zeros(self.vocab_size, dtype=torch.int64) - token_ids = torch.tensor( - list(chain.from_iterable(dataset['input_ids']))) - indices, counts = token_ids.unique(return_counts=True) - token_count.scatter_(0, indices, counts.to(token_count.dtype)) - dist = torch.distributions.Categorical(token_count) - return dist
-
- -
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html index 7f6ff9d58..6d276ed4e 100644 --- a/_modules/data_juicer/analysis/column_wise_analysis.html +++ b/_modules/data_juicer/analysis/column_wise_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@ @@ -90,8 +90,6 @@

Source code for data_juicer.analysis.column_wise_analysis

from .overall_analysis import OverallAnalysis -
-[docs] def get_row_col(total_num, factor=2): """ Given the total number of stats figures, get the "best" number of rows and @@ -130,17 +128,16 @@

Source code for data_juicer.analysis.column_wise_analysis

for i in range(total_num): grids.append((i // now_col, i % now_col)) - return int(now_row), int(now_col), grids
- + return int(now_row), int(now_col), grids
-[docs] +[docs] class ColumnWiseAnalysis: """Apply analysis on each column of stats respectively."""
-[docs] +[docs] def __init__(self, dataset, output_path, @@ -176,7 +173,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def analyze(self, show_percentiles=False, show=False, skip_export=False): """ Apply analysis and draw the analysis figure for stats. @@ -294,7 +291,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_hist(self, ax, data, save_path, percentiles=None, show=False): """ Draw the histogram for the data. @@ -355,7 +352,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_box(self, ax, data, save_path, percentiles=None, show=False): """ Draw the box plot for the data. @@ -406,7 +403,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_wordcloud(self, ax, data, save_path, show=False): word_list = data.tolist() word_nums = {} diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html index 4e60906ef..040686ef9 100644 --- a/_modules/data_juicer/analysis/diversity_analysis.html +++ b/_modules/data_juicer/analysis/diversity_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -88,8 +88,6 @@

Source code for data_juicer.analysis.diversity_analysis

# Modify from self_instruct, please refer to # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb -
-[docs] def find_root_verb_and_its_dobj(tree_root): """ Find the verb and its object closest to the root. @@ -110,14 +108,11 @@

Source code for data_juicer.analysis.diversity_analysis

for child in tree_root.children: return find_root_verb_and_its_dobj(child) # if no children satisfy the condition, return None - return None, None
- + return None, None # Modify from self_instruct, please refer to # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb -
-[docs] def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True): """ Find the verb and its object closest to the root of lexical tree of input @@ -136,12 +131,9 @@

Source code for data_juicer.analysis.diversity_analysis

verb, noun = find_root_verb_and_its_dobj(sent.root) if first_sent or (verb is not None and noun is not None): return verb, noun - return None, None
- + return None, None -
-[docs] def get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs): """ Given the lexical tree analysis result, return the diversity results. @@ -166,18 +158,17 @@

Source code for data_juicer.analysis.diversity_analysis

df = df.groupby('verb').apply(lambda x: x.sort_values( 'count', ascending=False).head(top_k_nouns)).reset_index(drop=True) - return df
- + return df
-[docs] +[docs] class DiversityAnalysis: """Apply diversity analysis for each sample and get an overall analysis result."""
-[docs] +[docs] def __init__(self, dataset, output_path, lang_or_model='en'): """Initialization method :param dataset: the dataset to be analyzed :param output_path: path to store the analysis results :param @@ -192,7 +183,7 @@

Source code for data_juicer.analysis.diversity_analysis

-[docs] +[docs] def compute(self, lang_or_model=None, column_name='text'): """ Apply lexical tree analysis on each sample. @@ -226,7 +217,7 @@

Source code for data_juicer.analysis.diversity_analysis

-[docs] +[docs] def analyze(self, lang_or_model=None, column_name='text', diff --git a/_modules/data_juicer/analysis/draw.html b/_modules/data_juicer/analysis/draw.html deleted file mode 100644 index 700b99410..000000000 --- a/_modules/data_juicer/analysis/draw.html +++ /dev/null @@ -1,154 +0,0 @@ - - - - - - - - data_juicer.analysis.draw — data_juicer 1.0.3 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.analysis.draw

-import matplotlib.pyplot as plt
-import numpy as np
-import seaborn as sns
-
-
-
-[docs] -def draw_heatmap(data, xlabels, ylables=None, figsize=None, triangle=False): - """ - Draw heatmap of input data with special lables. - - :param data: input data, now support - [`list`, `tuple`, `numpy array`, 'torch tensor'] - :param xlabels: x axis labels. - :param ylabels: y axis labels, if None, use xlabels. - :param figsize: figure size. - :param triangle: only display triangle. - :return: a plot figure. - """ - figsize = figsize if figsize else (8 * 2.5, 6 * 2.5) - _, ax = plt.subplots(figsize=figsize) - mask = None - if triangle: - mask = np.triu(np.ones_like(data)) - ax.tick_params( - right=True, - top=True, - labelright=True, - labeltop=True, - ) - sns.heatmap(data, - ax=ax, - cmap='Oranges', - annot=True, - mask=mask, - linewidths=.05, - square=True, - xticklabels=xlabels, - yticklabels=ylables, - annot_kws={'size': 8}) - plt.subplots_adjust(left=.1, right=0.95, bottom=0.22, top=0.95) - fig = plt.gcf() - plt.show() - return fig
- -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/analysis/measure.html b/_modules/data_juicer/analysis/measure.html deleted file mode 100644 index c1e319cba..000000000 --- a/_modules/data_juicer/analysis/measure.html +++ /dev/null @@ -1,372 +0,0 @@ - - - - - - - - data_juicer.analysis.measure — data_juicer 1.0.3 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.analysis.measure

-import numpy as np
-
-from data_juicer.utils.lazy_loader import LazyLoader
-
-torch = LazyLoader('torch', 'torch')
-td = LazyLoader('td', 'torch.distributions')
-F = LazyLoader('F', 'torch.nn.functional')
-
-stats = LazyLoader('stats', 'scipy.stats')
-
-
-
-[docs] -class Measure(object): - """Base class for Measure distribution. - """ - name = 'base' - -
-[docs] - def measure(self, *args, **kwargs): - pass
- - - def __call__(self, *args, **kwargs): - return self.measure(*args, **kwargs) - - def _convert_to_tensor(self, p): - """ - Convert input data to torch tensor. - :param p: input data, now support - [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. - :return: torch tensor - """ - if isinstance(p, torch.Tensor): - return p - elif isinstance(p, td.Categorical): - return p.probs - elif isinstance(p, str): - return torch.load(p) - else: - return torch.tensor(p) - - def _convert_to_categorical(self, p): - """ - Convert input data to torch Categorical. - :param p: input data, now support - [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. - :return: torch Categorical - """ - if isinstance(p, td.Categorical): - return p - elif isinstance(p, torch.Tensor): - return td.Categorical(p) - elif isinstance(p, str): - return td.Categorical(torch.load(p)) - else: - return td.Categorical(torch.tensor(p)) - - def _convert_to_ndarray(self, p): - """ - Convert input data to torch tensor. - :param p: input data, now support - [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. - :return: torch tensor - """ - return self._convert_to_tensor(p).numpy()
- - - -
-[docs] -class KLDivMeasure(Measure): - """ - Measure Kullback-Leibler divergence. - """ - name = 'kl_divergence' - -
-[docs] - def measure(self, p, q): - p = self._convert_to_categorical(p) - q = self._convert_to_categorical(q) - assert p.probs.shape == q.probs.shape, \ - 'The two inputs have different shape:' \ - f'{p.probs.shape} != {q.probs.shape} in {self.name}' - return F.kl_div(q.logits, p.probs, log_target=False, reduction='sum')
-
- - - -
-[docs] -class JSDivMeasure(Measure): - """ - Measure Jensen-Shannon divergence. - """ - name = 'js_divergence' - -
-[docs] - def measure(self, p, q): - p = self._convert_to_tensor(p) - q = self._convert_to_tensor(q) - assert p.shape == q.shape, \ - 'The two inputs have different shape:' \ - f'{p.shape} != {q.shape} in {self.name}' - - m = 0.5 * (p + q) - kl_p = KLDivMeasure()(p, m) - kl_q = KLDivMeasure()(q, m) - js = 0.5 * (kl_p + kl_q) - return js
-
- - - -
-[docs] -class CrossEntropyMeasure(Measure): - """ - Measure Cross-Entropy. - """ - name = 'cross_entropy' - -
-[docs] - def measure(self, p, q): - p = self._convert_to_categorical(p) - q = self._convert_to_categorical(q) - assert p.probs.shape == q.probs.shape, \ - 'The two inputs have different shape: '\ - f'{p.probs.shape} != {q.probs.shape} in {self.name}' - return F.cross_entropy(q.logits, p.probs, reduction='sum')
-
- - - -
-[docs] -class EntropyMeasure(Measure): - """ - Measure Entropy. - """ - name = 'entropy' - -
-[docs] - def measure(self, p): - p = self._convert_to_categorical(p) - return p.entropy()
-
- - - -
-[docs] -class RelatedTTestMeasure(Measure): - """ - Measure T-Test for two related distributions on their histogram of the same - bins. - - Ref: - https://en.wikipedia.org/wiki/Student%27s_t-test - - For continuous features or distributions, the input could be dataset stats - list. - For discrete features or distributions, the input could be the tags or the - categories list. - """ - name = 't-test' - -
-[docs] - @staticmethod - def stats_to_hist(p, q): - p = np.array(p) - q = np.array(q) - - # get common maximum number of data samples, and max/min values - max_data_num = max(len(p), len(q)) - min_val = min(min(p), min(q)) - max_val = max(max(p), max(q)) - - # get a recommended number of bins - rec_bins = max(int(np.sqrt(max_data_num)), 10) - - # get the common bin edges - common_p = np.append(p, [min_val, max_val]) - hist_p, bin_edges = np.histogram(common_p, bins=rec_bins) - # restore the hist of the original p - hist_p[0] -= 1 - hist_p[-1] -= 1 - # get the hist of the original q using the common bin edges - hist_q, _ = np.histogram(q, bins=bin_edges) - return hist_p, hist_q, bin_edges
- - -
-[docs] - @staticmethod - def category_to_hist(p, q): - - def flatten_list(lst): - res = [] - for s in lst: - if isinstance(s, list): - res.extend(flatten_list(s)) - else: - res.append(s) - return res - - # flatten the list - p = flatten_list(p) - q = flatten_list(q) - - # get the common categories - cat_p = set(p) - cat_q = set(q) - cat_common = cat_p.union(cat_q) - - # get category distributions - count_p = {cat: 0 for cat in cat_common} - count_q = {cat: 0 for cat in cat_common} - for cat in p: - count_p[cat] += 1 - for cat in q: - count_q[cat] += 1 - - # only keep distribution values sorted by counts - sorted_cat = list(count_p.items()) - sorted_cat.sort(key=lambda it: it[1], reverse=True) - sorted_cat = [it[0] for it in sorted_cat] - # get the value dist - hist_p = [count_p[cat] for cat in sorted_cat] - hist_q = [count_q[cat] for cat in sorted_cat] - - return hist_p, hist_q, count_p, count_q, sorted_cat
- - -
-[docs] - def measure(self, p, q): - """ - :param p: the first feature or distribution. (stats/tags/categories) - :param q: the second feature or distribution. (stats/tags/categories) - :return: the T-Test results object -- ([ref](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats._result_classes.TtestResult.html#scipy.stats._result_classes.TtestResult)) # noqa: E501 - """ - ele = p[0] - while isinstance(ele, list): - ele = ele[0] - if isinstance(ele, str): - # discrete tags or categories - hist_p, hist_q = self.category_to_hist(p, q)[:2] - else: - # continuous stats - hist_p, hist_q = self.stats_to_hist(p, q)[:2] - - # compute the t-test and pval for hist_p and hist_q - ttest_res = stats.ttest_rel(hist_p, hist_q) - return ttest_res
-
- -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html index 0c47c8f83..f8db011c0 100644 --- a/_modules/data_juicer/analysis/overall_analysis.html +++ b/_modules/data_juicer/analysis/overall_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -93,13 +93,13 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] class OverallAnalysis: """Apply analysis on the overall stats, including mean, std, quantiles, etc."""
-[docs] +[docs] def __init__(self, dataset, output_path): """ Initialization method. @@ -129,7 +129,7 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] def refine_single_column(self, col): if col.dtype != 'object': # not an object, return directly @@ -152,7 +152,7 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] def analyze(self, percentiles=[], num_proc=1, skip_export=False): """ Apply overall analysis on the whole dataset based on the describe diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html index e72d0751b..f98779731 100644 --- a/_modules/data_juicer/config/config.html +++ b/_modules/data_juicer/config/config.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -103,7 +103,7 @@

Source code for data_juicer.config.config

 
 
 
-[docs] +[docs] def init_configs(args: Optional[List[str]] = None, which_entry: object = None): """ initialize the jsonargparse parser and parse configs from one of: @@ -481,8 +481,6 @@

Source code for data_juicer.config.config

 
 
 
-