diff --git a/scripts/builtin/ampute.dml b/scripts/builtin/ampute.dml index 0da6af20fdb..ffc645b5b35 100644 --- a/scripts/builtin/ampute.dml +++ b/scripts/builtin/ampute.dml @@ -30,7 +30,6 @@ # mech a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified # weights a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. Will be overridden if mech == "MCAR" # seed a manually defined seed for reproducible RNG - # ------------------------------------------------------------------------------------- # # OUTPUT: diff --git a/scripts/builtin/confusionMatrix.dml b/scripts/builtin/confusionMatrix.dml index 3ac70fb3f87..b21088f2cfa 100644 --- a/scripts/builtin/confusionMatrix.dml +++ b/scripts/builtin/confusionMatrix.dml @@ -23,7 +23,7 @@ # and actual labels. We return both the counts and relative frequency # (normalized by sum of true labels) # -# .. code-block:: +# .. code-block:: text # # True Labels # 1 2 diff --git a/scripts/builtin/cooccurrenceMatrix.dml b/scripts/builtin/cooccurrenceMatrix.dml index 86b8b9ca169..590f4ba1e00 100644 --- a/scripts/builtin/cooccurrenceMatrix.dml +++ b/scripts/builtin/cooccurrenceMatrix.dml @@ -18,22 +18,21 @@ # under the License. # #------------------------------------------------------------- -# -# The implementation is based on + +# Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting. +# Adds an index column to the result. The implementation is based on # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c # -#------------------------------------------------------------- - -## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting. -## Adds an index column to the result. 
# INPUT: # ------------------------------------------------------------------------------ # S (Frame[Unknown]): 1D input data frame containing text data. # ------------------------------------------------------------------------------ +# # OUTPUT: # ------------------------------------------------------------------------------ # result (Frame[Unknown]): Processed text data with an index column. # ------------------------------------------------------------------------------ + processText = function(Frame[Unknown] S) return (Frame[Unknown] result){ print("processText"); tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")"); @@ -172,4 +171,4 @@ f_cooccurrenceMatrix = function( [wordPosition, docID] = getWordPosition(processedResult, maxTokens); [recodedWordPosition, tableSize, column] = getRecodedMatrix(wordPosition); coocMatrix = createCoocMatrix(cbind(docID, recodedWordPosition), tableSize, distanceWeighting, symmetric, windowSize); -} +} \ No newline at end of file diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml index 69bf12af90c..94c292d8554 100644 --- a/scripts/builtin/decisionTree.dml +++ b/scripts/builtin/decisionTree.dml @@ -30,9 +30,9 @@ # and the following trees, M would look as follows: # # (L1) |d<5| -# / \ +# / \\ # (L2) P1:2 |a<7| -# / \ +# / \\ # (L3) P2:2 P3:1 # # --> M := diff --git a/scripts/builtin/dedup.dml b/scripts/builtin/dedup.dml index 1ec2e29c395..af2ecafcdcd 100644 --- a/scripts/builtin/dedup.dml +++ b/scripts/builtin/dedup.dml @@ -28,11 +28,11 @@ # # INPUT: # -------------------------------------------------------------------------------------- -# X Input Frame[String] with n rows and d columns (raw tuples) -# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion -# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix) -# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean" -# 
threshold (optional) Double: threshold value above which tuples are considered duplicates +# X Input Frame[String] with n rows and d columns (raw tuples) +# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion +# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix) +# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean" +# threshold (optional) Double: threshold value above which tuples are considered duplicates # -------------------------------------------------------------------------------------- # # OUTPUT: diff --git a/scripts/builtin/differenceStatistics.dml b/scripts/builtin/differenceStatistics.dml index 0e9019f0963..30f207091e4 100644 --- a/scripts/builtin/differenceStatistics.dml +++ b/scripts/builtin/differenceStatistics.dml @@ -28,6 +28,11 @@ # X First Matrix to compare # Y Second Matrix to compare # -------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------- +# stats. Difference statistics +# ------------------------------------------------------------------------------------- m_differenceStatistics = function(Matrix[Double] X, Matrix[Double] Y) { diff --git a/scripts/builtin/glove.dml b/scripts/builtin/glove.dml index fc5ee9bafb3..9acf52975c6 100644 --- a/scripts/builtin/glove.dml +++ b/scripts/builtin/glove.dml @@ -18,6 +18,51 @@ # under the License. #------------------------------------------------------------- + +# Computes the vector embeddings for words in a large text corpus. +# +# INPUT: +# -------------------------------------------------------------------------------- +# input 1DInput corpus in CSV format. +# seed Random seed for reproducibility. +# vector_size Dimensionality of word vectors, V. +# eta Learning rate for optimization, recommended value: 0.05. 
+# alpha Weighting function parameter, recommended value: 0.75. +# x_max Maximum co-occurrence value as per the GloVe paper: 100. +# tol Tolerance value to avoid overfitting, recommended value: 1e-4. +# iterations Total number of training iterations. +# print_loss_it Interval (in iterations) for printing the loss. +# maxTokens Maximum number of tokens per text entry. +# windowSize Context window size. +# distanceWeighting Whether to apply distance-based weighting. +# symmetric Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE). +# ------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------ +# G The word indices and their word vectors, of shape (N, V). Each represented as a vector, of shape (1,V) +# ------------------------------------------------------------------------------ + + +f_glove = function( + Frame[Unknown] input, + int seed, int vector_size, + double alpha, double eta, + double x_max, + double tol, + int iterations, + int print_loss_it, + Int maxTokens, + Int windowSize, + Boolean distanceWeighting, + Boolean symmetric) + return (frame[Unknown] G){ + + [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric); + G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); +} + + init = function(matrix[double] cooc_matrix, double x_max, double alpha) return(matrix[double] weights, matrix[double] log_cooc_matrix){ E = 2.718281828; @@ -118,45 +163,3 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error); G = cbind(cooc_index[,2], as.frame(G)); } - -glove = function( - Frame[Unknown] input, - int seed, int vector_size, - double alpha, double eta, - double x_max, - 
double tol, - int iterations, - int print_loss_it, - Int maxTokens, - Int windowSize, - Boolean distanceWeighting, - Boolean symmetric) - return (frame[Unknown] G){ - - /* - * Main function to Computes the vector embeddings for words in a large text corpus. - * INPUT: - * ------------------------------------------------------------------------------ - * - input (Frame[Unknown]): 1DInput corpus in CSV format. - * - seed: Random seed for reproducibility. - * - vector_size: Dimensionality of word vectors, V. - * - eta: Learning rate for optimization, recommended value: 0.05. - * - alpha: Weighting function parameter, recommended value: 0.75. - * - x_max: Maximum co-occurrence value as per the GloVe paper: 100. - * - tol: Tolerance value to avoid overfitting, recommended value: 1e-4. - * - iterations: Total number of training iterations. - * - print_loss_it: Interval (in iterations) for printing the loss. - * - maxTokens (Int): Maximum number of tokens per text entry. - * - windowSize (Int): Context window size. - * - distanceWeighting (Boolean): Whether to apply distance-based weighting. - * - symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE). - * ------------------------------------------------------------------------------ - * OUTPUT: - * ------------------------------------------------------------------------------ - * G (Frame[Unknown]): The word indices and their word vectors, of shape (N, V). 
Each represented as a vector, of shape (1,V) - * ------------------------------------------------------------------------------ - */ - - [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric); - G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); -} diff --git a/scripts/builtin/imputeByKNN.dml b/scripts/builtin/imputeByKNN.dml index 13136ff2c9a..edd8e7727d2 100644 --- a/scripts/builtin/imputeByKNN.dml +++ b/scripts/builtin/imputeByKNN.dml @@ -25,23 +25,16 @@ # the missing values by column means. Currently, only the column with the most # missing values is actually imputed. # -# ------------------------------------------------------------------------------ # INPUT: # ------------------------------------------------------------------------------ -# X Matrix with missing values, which are represented as NaNs -# method Method used for imputing missing values with different performance -# and accuracy tradeoffs: -# 'dist' (default): Compute all-pairs distances and impute the -# missing values by closest. O(N^2 * #features) -# 'dist_missing': Compute distances between data and records with -# missing values. O(N*M * #features), assuming -# that the number of records with MV is M< M := # [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) # [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) diff --git a/scripts/builtin/shapExplainer.dml b/scripts/builtin/shapExplainer.dml index b78a5dbcefb..39d365bf013 100644 --- a/scripts/builtin/shapExplainer.dml +++ b/scripts/builtin/shapExplainer.dml @@ -51,6 +51,7 @@ # S Matrix holding the shapley values along the cols, one row per instance. # expected Double holding the average prediction of all instances. 
# ----------------------------------------------------------------------------- + s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances, Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0, Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0) diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index 6f946c7729c..c9987320928 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -19,8 +19,44 @@ # #------------------------------------------------------------- -# This function cleans top-K item (where K is given as input)for a given list of users. +# This function cleans top-K item (where K is given as input) for a given list of users. # metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask +# +# INPUT: +# ------------------------------------------------------------------------------ +# dataTrain Training set +# dataTest Test set ignored when cv is set to True +# metaData 3×n frame with schema, categorical mask, and FD mask for dataTrain +# primitives Library of primitive cleaning operators +# parameters Hyperparameter search space that matches the primitives +# refSol Reference solution +# evaluationFunc Name of a SystemDS DML function that scores a pipeline +# evalFunHp Hyperparameter matrix for the above evaluation function +# topK Number of best pipelines to return +# resource_val Maximum resource R for the Bandit search +# max_iter Maximum iterations while enumerating logical pipelines +# lq Lower quantile used by utils::doErrorSample when triggered +# uq Upper quantile used by utils::doErrorSample when triggered +# sample Fraction of rows to subsample from dataTrain +# expectedIncrease Minimum improvement over dirtyScore that a candidate must deliver +# seed Seed number +# cv TRUE means k-fold CV, FALSE means hold-out split +# 
cvk Number of folds if cv = TRUE +# isLastLabel TRUE if the last column is the label +# rowCount Row-count threshold above which doErrorSample may replace uniform sampling +# correctTypos Run spelling correction in the string preprocessing step +# enablePruning Enable pruning inside the Bandit phase +# ------------------------------------------------------------------------------ +# +# OUTPUT: +#------------------------------------------------------------------------------- +# topKPipelines K cleaned-data pipelines +# topKHyperParams Hyperparameter matrix with rows aligning with topKPipelines +# topKScores Evaluation scores with rows aligning with topKPipelines +# dirtyScore Baseline score on the unclean data +# evalFunHp Updated evaluation function hyperparameters +# applyFunc Frame of “apply” functions for deploying each of the top-K pipelines +#------------------------------------------------------------------------------- source("scripts/pipelines/scripts/utils.dml") as utils; source("scripts/pipelines/scripts/enumerateLogical.dml") as lg; diff --git a/src/main/python/docs/README.md b/src/main/python/docs/README.md index 61bdd24a3e9..e5bc5c59583 100644 --- a/src/main/python/docs/README.md +++ b/src/main/python/docs/README.md @@ -39,4 +39,4 @@ and then run `make html`: make html ``` -The docs will then be created at: `/src/main/python/build`in HTML will be placed in the `./_build` directory. +The docs will then be created at: `/src/main/python/docs/build/html/`. 
\ No newline at end of file diff --git a/src/main/python/docs/requires-docs.txt b/src/main/python/docs/requires-docs.txt index 9305d9320fa..1022b652401 100644 --- a/src/main/python/docs/requires-docs.txt +++ b/src/main/python/docs/requires-docs.txt @@ -24,4 +24,5 @@ sphinx_rtd_theme numpy py4j scipy -requests \ No newline at end of file +requests +pandas \ No newline at end of file diff --git a/src/main/python/generator/dml_parser.py b/src/main/python/generator/dml_parser.py index 2abffb021f6..8e835e96a12 100644 --- a/src/main/python/generator/dml_parser.py +++ b/src/main/python/generator/dml_parser.py @@ -28,7 +28,7 @@ class FunctionParser(object): header_input_pattern = r"^[ \t\n]*[#]+[ \t\n]*input[ \t\n\w:;.,#]*[\s#\-]*[#]+[\w\s\d:,.()\" \t\n\-]*[\s#\-]*$" header_output_pattern = r"[\s#\-]*[#]+[ \t]*(return|output)[ \t\w:;.,#]*[\s#\-]*[#]+[\w\s\d:,.()\" \t\-]*[\s#\-]*$" - function_pattern = r"^[ms]_[\w]+[ \t\n]*=[ \t\n]+function[^#{]*" + function_pattern = r"^[fms]_[\w]+[ \t\n]*=[ \t\n]+function[^#{]*" # parameter_pattern = r"^m_[\w]+[\s]+=[\s]+function[\s]*\([\s]*(?=return)[\s]*\)[\s]*return[\s]*\([\s]*([\w\[\]\s,\d=.\-_]*)[\s]*\)[\s]*" header_parameter_pattern = r"[\s#\-]*[#]+[ \t]*([\w|-]+)[\s]+([\w]+)[\s]+([\w,\d.\"\-]+)[\s]+([\w|\W]+)" divider_pattern = r"[\s#\-]*" @@ -57,15 +57,13 @@ def parse_function(self, path: str): """ file_name = os.path.basename(path) function_name, extension = os.path.splitext(file_name) - # try: - function_definition = self.find_function_definition(path) - # pattern = re.compile( - # self.__class__.parameter_pattern, flags=re.I | re.M) - # match = pattern.match(function_definition) - - # if match: + try: + function_definition = self.find_function_definition(path) + except AttributeError: + print(f"[INFO] Skipping '{function_name}': does not match function name pattern. 
It is likely an internal function.") + return - func_split = function_definition.split("function")[1].split("return") + func_split = function_definition.split("function", 1)[1].split("return") param_str = self.extract_param_str(func_split[0]) retval_str = None diff --git a/src/main/python/systemds/operator/algorithm/__init__.py b/src/main/python/systemds/operator/algorithm/__init__.py index bd611ee6cc6..e8cb4c04e95 100644 --- a/src/main/python/systemds/operator/algorithm/__init__.py +++ b/src/main/python/systemds/operator/algorithm/__init__.py @@ -31,6 +31,7 @@ from .builtin.alsPredict import alsPredict from .builtin.alsTopkPredict import alsTopkPredict from .builtin.ampute import ampute +from .builtin.apply_pipeline import apply_pipeline from .builtin.arima import arima from .builtin.auc import auc from .builtin.autoencoder_2layer import autoencoder_2layer @@ -38,7 +39,10 @@ from .builtin.bivar import bivar from .builtin.components import components from .builtin.confusionMatrix import confusionMatrix +from .builtin.cooccurrenceMatrix import cooccurrenceMatrix from .builtin.cor import cor +from .builtin.correctTypos import correctTypos +from .builtin.correctTyposApply import correctTyposApply from .builtin.cov import cov from .builtin.cox import cox from .builtin.cspline import cspline @@ -49,16 +53,24 @@ from .builtin.dbscanApply import dbscanApply from .builtin.decisionTree import decisionTree from .builtin.decisionTreePredict import decisionTreePredict +from .builtin.dedup import dedup from .builtin.deepWalk import deepWalk +from .builtin.denialConstraints import denialConstraints from .builtin.differenceStatistics import differenceStatistics from .builtin.discoverFD import discoverFD from .builtin.dist import dist +from .builtin.dmv import dmv +from .builtin.ema import ema from .builtin.executePipeline import executePipeline from .builtin.f1Score import f1Score from .builtin.fdr import fdr from .builtin.ffPredict import ffPredict from .builtin.ffTrain import 
ffTrain +from .builtin.fit_pipeline import fit_pipeline +from .builtin.fixInvalidLengths import fixInvalidLengths +from .builtin.fixInvalidLengthsApply import fixInvalidLengthsApply from .builtin.flattenQuantile import flattenQuantile +from .builtin.frameSort import frameSort from .builtin.frequencyEncode import frequencyEncode from .builtin.frequencyEncodeApply import frequencyEncodeApply from .builtin.garch import garch @@ -66,6 +78,7 @@ from .builtin.getAccuracy import getAccuracy from .builtin.glm import glm from .builtin.glmPredict import glmPredict +from .builtin.glove import glove from .builtin.gmm import gmm from .builtin.gmmPredict import gmmPredict from .builtin.gnmf import gnmf @@ -97,6 +110,7 @@ from .builtin.impurityMeasures import impurityMeasures from .builtin.imputeByFD import imputeByFD from .builtin.imputeByFDApply import imputeByFDApply +from .builtin.imputeByKNN import imputeByKNN from .builtin.imputeByMean import imputeByMean from .builtin.imputeByMeanApply import imputeByMeanApply from .builtin.imputeByMedian import imputeByMedian @@ -126,6 +140,7 @@ from .builtin.mape import mape from .builtin.matrixProfile import matrixProfile from .builtin.mcc import mcc +from .builtin.mdedup import mdedup from .builtin.mice import mice from .builtin.miceApply import miceApply from .builtin.mse import mse @@ -153,6 +168,7 @@ from .builtin.pnmf import pnmf from .builtin.ppca import ppca from .builtin.psnr import psnr +from .builtin.quantizeByCluster import quantizeByCluster from .builtin.raGroupby import raGroupby from .builtin.raJoin import raJoin from .builtin.raSelection import raSelection @@ -165,6 +181,7 @@ from .builtin.selectByVarThresh import selectByVarThresh from .builtin.ses import ses from .builtin.setdiff import setdiff +from .builtin.shapExplainer import shapExplainer from .builtin.sherlock import sherlock from .builtin.sherlockPredict import sherlockPredict from .builtin.shortestPath import shortestPath @@ -189,10 +206,12 @@ from .builtin.tSNE 
import tSNE from .builtin.toOneHot import toOneHot from .builtin.tomeklink import tomeklink +from .builtin.topk_cleaning import topk_cleaning from .builtin.underSampling import underSampling from .builtin.union import union from .builtin.univar import univar from .builtin.vectorToCsv import vectorToCsv +from .builtin.wer import wer from .builtin.winsorize import winsorize from .builtin.winsorizeApply import winsorizeApply from .builtin.xdummy1 import xdummy1 @@ -211,6 +230,7 @@ 'alsPredict', 'alsTopkPredict', 'ampute', + 'apply_pipeline', 'arima', 'auc', 'autoencoder_2layer', @@ -218,7 +238,10 @@ 'bivar', 'components', 'confusionMatrix', + 'cooccurrenceMatrix', 'cor', + 'correctTypos', + 'correctTyposApply', 'cov', 'cox', 'cspline', @@ -229,16 +252,24 @@ 'dbscanApply', 'decisionTree', 'decisionTreePredict', + 'dedup', 'deepWalk', + 'denialConstraints', 'differenceStatistics', 'discoverFD', 'dist', + 'dmv', + 'ema', 'executePipeline', 'f1Score', 'fdr', 'ffPredict', 'ffTrain', + 'fit_pipeline', + 'fixInvalidLengths', + 'fixInvalidLengthsApply', 'flattenQuantile', + 'frameSort', 'frequencyEncode', 'frequencyEncodeApply', 'garch', @@ -246,6 +277,7 @@ 'getAccuracy', 'glm', 'glmPredict', + 'glove', 'gmm', 'gmmPredict', 'gnmf', @@ -277,6 +309,7 @@ 'impurityMeasures', 'imputeByFD', 'imputeByFDApply', + 'imputeByKNN', 'imputeByMean', 'imputeByMeanApply', 'imputeByMedian', @@ -306,6 +339,7 @@ 'mape', 'matrixProfile', 'mcc', + 'mdedup', 'mice', 'miceApply', 'mse', @@ -333,6 +367,7 @@ 'pnmf', 'ppca', 'psnr', + 'quantizeByCluster', 'raGroupby', 'raJoin', 'raSelection', @@ -345,6 +380,7 @@ 'selectByVarThresh', 'ses', 'setdiff', + 'shapExplainer', 'sherlock', 'sherlockPredict', 'shortestPath', @@ -369,10 +405,12 @@ 'tSNE', 'toOneHot', 'tomeklink', + 'topk_cleaning', 'underSampling', 'union', 'univar', 'vectorToCsv', + 'wer', 'winsorize', 'winsorizeApply', 'xdummy1', diff --git a/src/main/python/systemds/operator/algorithm/builtin/ampute.py 
b/src/main/python/systemds/operator/algorithm/builtin/ampute.py index d323000710e..fb3a82a380f 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/ampute.py +++ b/src/main/python/systemds/operator/algorithm/builtin/ampute.py @@ -33,6 +33,16 @@ def ampute(X: Matrix, """ This function injects missing values into a multivariate a given dataset, similarly to the ampute() method in R's MICE package. + + + :param X: a multivariate numeric dataset [shape: n-by-m] + :param prop: a number in the (0, 1] range specifying the proportion of amputed rows across the entire dataset + :param patterns: a pattern matrix of 0's and 1's [shape: k-by-m] where each row corresponds to a pattern. 0 indicates that a variable should have missing values and 1 indicating that a variable should remain complete + :param freq: a vector [length: k] containing the relative frequency with which each pattern in the patterns matrix should occur + :param mech: a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified + :param weights: a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. 
Will be overridden if mech == "MCAR" + :param seed: a manually defined seed for reproducible RNG + :return: amputed output dataset """ params_dict = {'X': X} diff --git a/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py b/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py index be1100b4127..63ffc3f66b3 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py +++ b/src/main/python/systemds/operator/algorithm/builtin/apply_pipeline.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py b/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py index 81c549b5982..66a01780b0e 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py +++ b/src/main/python/systemds/operator/algorithm/builtin/confusionMatrix.py @@ -35,7 +35,7 @@ def confusionMatrix(P: Matrix, and actual labels. We return both the counts and relative frequency (normalized by sum of true labels) - .. code-block:: + .. code-block:: text True Labels 1 2 diff --git a/src/main/python/systemds/operator/algorithm/builtin/cooccurrenceMatrix.py b/src/main/python/systemds/operator/algorithm/builtin/cooccurrenceMatrix.py new file mode 100644 index 00000000000..6df77d3e7dd --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/cooccurrenceMatrix.py @@ -0,0 +1,58 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/cooccurrenceMatrix.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def cooccurrenceMatrix(input: Frame, + maxTokens: int, + windowSize: int, + distanceWeighting: bool, + symmetric: bool): + """ + Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting. + Adds an index column to the result. The implementation is based on + https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c + + + + :param S: (Frame[Unknown]): 1D input data frame containing text data. + :return: (Frame[Unknown]): Processed text data with an index column. 
+ """ + + params_dict = {'input': input, 'maxTokens': maxTokens, 'windowSize': windowSize, 'distanceWeighting': distanceWeighting, 'symmetric': symmetric} + + vX_0 = Matrix(input.sds_context, '') + vX_1 = Frame(input.sds_context, '') + output_nodes = [vX_0, vX_1, ] + + op = MultiReturn(input.sds_context, 'cooccurrenceMatrix', output_nodes, named_input_nodes=params_dict) + + vX_0._unnamed_input_nodes = [op] + vX_1._unnamed_input_nodes = [op] + + return op diff --git a/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py b/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py index 321a1949f58..64354a9bc3e 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py +++ b/src/main/python/systemds/operator/algorithm/builtin/correctTypos.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py b/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py index 0a2c61a6f40..5da8769509c 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py +++ b/src/main/python/systemds/operator/algorithm/builtin/correctTyposApply.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py index a1a751d0aad..3fe565b8c7e 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py +++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py @@ -44,9 +44,9 @@ def 
decisionTree(X: Matrix, and the following trees, M would look as follows: (L1) |d<5| - / \ + / \\ (L2) P1:2 |a<7| - / \ + / \\ (L3) P2:2 P3:1 --> M := diff --git a/src/main/python/systemds/operator/algorithm/builtin/dedup.py b/src/main/python/systemds/operator/algorithm/builtin/dedup.py new file mode 100644 index 00000000000..13d5c35a41e --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/dedup.py @@ -0,0 +1,68 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/dedup.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def dedup(X: Frame, + gloveMatrix: Matrix, + vocab: Frame, + **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + Builtin for deduplication using distributed representations (DRs) and + locality-sensitive hashing (LSH) based blocking. 
+ + The function encodes each input tuple as a dense vector using pre-trained GloVe embeddings (simple averaging), + groups semantically similar tuples via LSH into buckets, and compares only those pairs for deduplication. + + + + + :param X: Input Frame[String] with n rows and d columns (raw tuples) + :param gloveMatrix: Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion + :param vocab: Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix) + :param similarityMeasure: (optional) String specifying similarity metric: "cosine", "euclidean" + :param threshold: (optional) Double: threshold value above which tuples are considered duplicates + :return: Frame[String] with deduplicated tuples + (first occurrence of each duplicate group is retained) + :return: Frame[String] with all detected duplicates + (i.e., tuples removed from the input) + """ + + params_dict = {'X': X, 'gloveMatrix': gloveMatrix, 'vocab': vocab} + params_dict.update(kwargs) + + vX_0 = Frame(X.sds_context, '') + vX_1 = Frame(X.sds_context, '') + output_nodes = [vX_0, vX_1, ] + + op = MultiReturn(X.sds_context, 'dedup', output_nodes, named_input_nodes=params_dict) + + vX_0._unnamed_input_nodes = [op] + vX_1._unnamed_input_nodes = [op] + + return op diff --git a/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py b/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py index 347502b848e..5cdec212965 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py +++ b/src/main/python/systemds/operator/algorithm/builtin/denialConstraints.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py 
b/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py index dfe2218a424..b6597bb6e4b 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py +++ b/src/main/python/systemds/operator/algorithm/builtin/differenceStatistics.py @@ -35,6 +35,11 @@ def differenceStatistics(X: Matrix, they are different. This can be used for instance in comparison of lossy compression techniques, that reduce the fidelity of the data. + + + :param X: First Matrix to compare + :param Y: Second Matrix to compare + :return: Difference statistics """ params_dict = {'X': X, 'Y': Y} diff --git a/src/main/python/systemds/operator/algorithm/builtin/dmv.py b/src/main/python/systemds/operator/algorithm/builtin/dmv.py index deaf3ea8a6b..2955e505e13 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/dmv.py +++ b/src/main/python/systemds/operator/algorithm/builtin/dmv.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/ema.py b/src/main/python/systemds/operator/algorithm/builtin/ema.py index 4e0ccca6bbb..90f9a852d76 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/ema.py +++ b/src/main/python/systemds/operator/algorithm/builtin/ema.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py b/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py index 1fffb46f100..66750fc0711 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py +++ 
b/src/main/python/systemds/operator/algorithm/builtin/executePipeline.py @@ -28,7 +28,18 @@ from systemds.utils.consts import VALID_INPUT_TYPES -def executePipeline(X: Matrix): +def executePipeline(pipeline: Frame, + Xtrain: Matrix, + Ytrain: Matrix, + Xtest: Matrix, + Ytest: Matrix, + metaList: List, + hyperParameters: Matrix, + flagsCount: int, + verbose: bool, + startInd: int, + endInd: int, + **kwargs: Dict[str, VALID_INPUT_TYPES]): """ This function execute pipeline. @@ -56,17 +67,30 @@ def executePipeline(X: Matrix): :return: --- """ - params_dict = {'X': X} + params_dict = {'pipeline': pipeline, 'Xtrain': Xtrain, 'Ytrain': Ytrain, 'Xtest': Xtest, 'Ytest': Ytest, 'metaList': metaList, 'hyperParameters': hyperParameters, 'flagsCount': flagsCount, 'verbose': verbose, 'startInd': startInd, 'endInd': endInd} + params_dict.update(kwargs) - vX_0 = Matrix(X.sds_context, '') - vX_1 = Matrix(X.sds_context, '') - vX_2 = Matrix(X.sds_context, '') - output_nodes = [vX_0, vX_1, vX_2, ] + vX_0 = Matrix(pipeline.sds_context, '') + vX_1 = Matrix(pipeline.sds_context, '') + vX_2 = Matrix(pipeline.sds_context, '') + vX_3 = Matrix(pipeline.sds_context, '') + vX_4 = Scalar(pipeline.sds_context, '') + vX_5 = Matrix(pipeline.sds_context, '') + vX_6 = Matrix(pipeline.sds_context, '') + vX_7 = Scalar(pipeline.sds_context, '') + vX_8 = List(pipeline.sds_context, '') + output_nodes = [vX_0, vX_1, vX_2, vX_3, vX_4, vX_5, vX_6, vX_7, vX_8, ] - op = MultiReturn(X.sds_context, 'executePipeline', output_nodes, named_input_nodes=params_dict) + op = MultiReturn(pipeline.sds_context, 'executePipeline', output_nodes, named_input_nodes=params_dict) vX_0._unnamed_input_nodes = [op] vX_1._unnamed_input_nodes = [op] vX_2._unnamed_input_nodes = [op] + vX_3._unnamed_input_nodes = [op] + vX_4._unnamed_input_nodes = [op] + vX_5._unnamed_input_nodes = [op] + vX_6._unnamed_input_nodes = [op] + vX_7._unnamed_input_nodes = [op] + vX_8._unnamed_input_nodes = [op] return op diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py b/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py index 5de40c745f8..48363035d8b 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py +++ b/src/main/python/systemds/operator/algorithm/builtin/fit_pipeline.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py index b635f31b298..cc0e83a51e4 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py +++ b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengths.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py index cc8fe68aacc..ed2572368d3 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py +++ b/src/main/python/systemds/operator/algorithm/builtin/fixInvalidLengthsApply.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/frameSort.py b/src/main/python/systemds/operator/algorithm/builtin/frameSort.py index 0bfc7f3afec..2575baefe4b 100644 --- 
a/src/main/python/systemds/operator/algorithm/builtin/frameSort.py +++ b/src/main/python/systemds/operator/algorithm/builtin/frameSort.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES diff --git a/src/main/python/systemds/operator/algorithm/builtin/glove.py b/src/main/python/systemds/operator/algorithm/builtin/glove.py new file mode 100644 index 00000000000..3df38dfbfbd --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/glove.py @@ -0,0 +1,68 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/glove.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def glove(input: Frame, + seed: int, + vector_size: int, + alpha: float, + eta: float, + x_max: float, + tol: float, + iterations: int, + print_loss_it: int, + maxTokens: int, + windowSize: int, + distanceWeighting: bool, + symmetric: bool): + """ + Computes the vector embeddings for words in a large text corpus. + + + + :param input: 1DInput corpus in CSV format. + :param seed: Random seed for reproducibility. + :param vector_size: Dimensionality of word vectors, V. + :param eta: Learning rate for optimization, recommended value: 0.05. + :param alpha: Weighting function parameter, recommended value: 0.75. + :param x_max: Maximum co-occurrence value as per the GloVe paper: 100. + :param tol: Tolerance value to avoid overfitting, recommended value: 1e-4. + :param iterations: Total number of training iterations. + :param print_loss_it: Interval (in iterations) for printing the loss. + :param maxTokens: Maximum number of tokens per text entry. + :param windowSize: Context window size. + :param distanceWeighting: Whether to apply distance-based weighting. + :param symmetric: Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE). + :return: The word indices and their word vectors, of shape (N, V). 
Each represented as a vector, of shape (1,V) + """ + + params_dict = {'input': input, 'seed': seed, 'vector_size': vector_size, 'alpha': alpha, 'eta': eta, 'x_max': x_max, 'tol': tol, 'iterations': iterations, 'print_loss_it': print_loss_it, 'maxTokens': maxTokens, 'windowSize': windowSize, 'distanceWeighting': distanceWeighting, 'symmetric': symmetric} + return Matrix(input.sds_context, + 'glove', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py b/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py index fcc096180b9..f04aa098514 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py +++ b/src/main/python/systemds/operator/algorithm/builtin/imputeByKNN.py @@ -25,13 +25,30 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES def imputeByKNN(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + Imputes missing values, indicated by NaNs, using KNN-based methods + (k-nearest neighbors by euclidean distance). In order to avoid NaNs in + distance computation and meaningful nearest neighbor search, we initialize + the missing values by column means. Currently, only the column with the most + missing values is actually imputed. + + + :param X: Matrix with missing values, which are represented as NaNs + :param method: Method used for imputing missing values with different performance and accuracy tradeoffs:\n + - 'dist' (default): Compute all-pairs distances and impute the missing values by closest. O(N^2 * #features) + - 'dist_missing': Compute distances between data and records with missing values. 
O(N*M * #features), assuming that the number of records with MV is M< M := [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) diff --git a/src/main/python/systemds/operator/algorithm/builtin/shapExplainer.py b/src/main/python/systemds/operator/algorithm/builtin/shapExplainer.py new file mode 100644 index 00000000000..42a0afb6e69 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/shapExplainer.py @@ -0,0 +1,78 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/shapExplainer.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def shapExplainer(model_function: str, + model_args: List, + x_instances: Matrix, + X_bg: Matrix, + **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + Computes shapley values for multiple instances in parallel using antithetic permutation sampling. 
+ The resulting matrix phis holds the shapley values for each feature in the column given by the index of the feature in the sample. + + This method first creates two large matrices for masks and masked background data for all permutations and + then runs in parallel on all instances in x. + While the prepared matrices can become very large (2 * #features * #permutations * #n_samples * #features), + the preparation of a row for the model call breaks down to a single element-wise multiplication of this mask with the row and + an addition to the masked background data, since masks can be reused for each instance. + + + + :param model_function: The function of the model to be evaluated as a String. This function has to take a matrix of samples + and return a vector of predictions. + It might be useful to wrap the model into a function that takes and returns the desired shapes and + use this wrapper here. + :param model_args: Arguments in order for the model, if desired. This will be prepended by the created instances-matrix. + :param x_instances: Multiple instances as rows for which to compute the shapley values. + :param X_bg: The background dataset from which to pull the random samples to perform Monte Carlo integration. + :param n_permutations: The number of permutations. Defaults to 10. Theoretically, 1 should already be enough for models with up + to second order interaction effects. + :param n_samples: Number of samples from X_bg used for marginalization. + :param remove_non_var: EXPERIMENTAL: If set, for every instance the variance of each feature is checked against this feature in the + background data. If it does not change, we do not run any model calls for it. + :param seed: A seed, in case the sampling has to be deterministic. + :param verbose: A boolean to enable logging of each step of the function. + :return: Matrix holding the shapley values along the cols, one row per instance. + :return: Double holding the average prediction of all instances.
+ """ + + params_dict = {'model_function': model_function, 'model_args': model_args, 'x_instances': x_instances, 'X_bg': X_bg} + params_dict.update(kwargs) + + vX_0 = Matrix(x_instances.sds_context, '') + vX_1 = Scalar(x_instances.sds_context, '') + output_nodes = [vX_0, vX_1, ] + + op = MultiReturn(x_instances.sds_context, 'shapExplainer', output_nodes, named_input_nodes=params_dict) + + vX_0._unnamed_input_nodes = [op] + vX_1._unnamed_input_nodes = [op] + + return op diff --git a/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py b/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py index 16a20d20e08..270a6d7b166 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py +++ b/src/main/python/systemds/operator/algorithm/builtin/topk_cleaning.py @@ -25,7 +25,6 @@ from typing import Dict, Iterable from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar -from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES @@ -36,8 +35,39 @@ def topk_cleaning(dataTrain: Frame, evalFunHp: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]): """ - This function cleans top-K item (where K is given as input)for a given list of users. + This function cleans top-K item (where K is given as input) for a given list of users.
metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask + + + + :param dataTrain: Training set + :param dataTest: Test set ignored when cv is set to True + :param metaData: 3×n frame with schema, categorical mask, and FD mask for dataTrain + :param primitives: Library of primitive cleaning operators + :param parameters: Hyperparameter search space that matches the primitives + :param refSol: Reference solution + :param evaluationFunc: Name of a SystemDS DML function that scores a pipeline + :param evalFunHp: Hyperparameter matrix for the above evaluation function + :param topK: Number of best pipelines to return + :param resource_val: Maximum resource R for the Bandit search + :param max_iter: Maximum iterations while enumerating logical pipelines + :param lq: Lower quantile used by utils::doErrorSample when triggered + :param uq: Upper quantile used by utils::doErrorSample when triggered + :param sample: Fraction of rows to subsample from dataTrain + :param expectedIncrease: Minimum improvement over dirtyScore that a candidate must deliver + :param seed: Seed number + :param cv: TRUE means k-fold CV, FALSE means hold-out split + :param cvk: Number of folds if cv = TRUE + :param isLastLabel: TRUE if the last column is the label + :param rowCount: Row-count threshold above which doErrorSample may replace uniform sampling + :param correctTypos: Run spelling correction in the string preprocessing step + :param enablePruning: Enable pruning inside the Bandit phase + :return: K cleaned-data pipelines + :return: Hyperparameter matrix with rows aligning with topKPipelines + :return: Evaluation scores with rows aligning with topKPipelines + :return: Baseline score on the unclean data + :return: Updated evaluation function hyperparameters + :return: Frame of “apply” functions for deploying each of the top-K pipelines """ params_dict = {'dataTrain': dataTrain, 'primitives': primitives, 'parameters': parameters, 'evaluationFunc': 
evaluationFunc, 'evalFunHp': evalFunHp} diff --git a/src/main/python/systemds/operator/algorithm/builtin/wer.py b/src/main/python/systemds/operator/algorithm/builtin/wer.py new file mode 100644 index 00000000000..99d278461cf --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/wer.py @@ -0,0 +1,48 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/wer.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def wer(R: Frame, + H: Frame): + """ + This built-in function computes the word error rate (WER) + defined as wer = (numSubst + numDel + numIns) / length(r) + + + + :param R: Input frame of reference strings, shape: [N x 1] + :param H: Input frame of hypothesis strings, shape: [N x 1] + :return: Output matrix of word error rate per pair of strings, + shape: [N x 1], where W[i,1] = wer(R[i,1], H[i,1]) + """ + + params_dict = {'R': R, 'H': H} + return Matrix(R.sds_context, + 'wer', + named_input_nodes=params_dict)