1 change: 0 additions & 1 deletion scripts/builtin/ampute.dml
@@ -30,7 +30,6 @@
# mech a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified
# weights a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. Will be overridden if mech == "MCAR"
# seed a manually defined seed for reproducible RNG

# -------------------------------------------------------------------------------------
#
# OUTPUT:
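Reviewer note: a minimal usage sketch of this builtin under the header above. The parameter names mech and seed come from the header; the data argument X and the output variable name follow the usual builtin convention and are assumptions here.

```dml
X = rand(rows=100, cols=4, seed=7);             # toy data
# MCAR ignores any weight matrix, per the header above
result = ampute(X=X, mech="MCAR", seed=42);
print("amputed cells: " + sum(isNaN(result)));  # count injected missing values
```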
2 changes: 1 addition & 1 deletion scripts/builtin/confusionMatrix.dml
@@ -23,7 +23,7 @@
# and actual labels. We return both the counts and relative frequency
# (normalized by sum of true labels)
#
# .. code-block::
# .. code-block:: text
#
# True Labels
# 1 2
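To make the rendered block above concrete, a hedged sketch; the parameter names P and Y are taken from the builtin's full header, which this hunk truncates:

```dml
P = matrix("1 2 1 2 2", rows=5, cols=1);   # predicted labels (1-based)
Y = matrix("1 2 2 2 1", rows=5, cols=1);   # true labels
[counts, relFreq] = confusionMatrix(P=P, Y=Y);
print(toString(counts));   # counts; relFreq is normalized by sum of true labels
```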
13 changes: 6 additions & 7 deletions scripts/builtin/cooccurrenceMatrix.dml
@@ -18,22 +18,21 @@
# under the License.
#
#-------------------------------------------------------------
#
# The implementation is based on

# Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
# Adds an index column to the result. The implementation is based on
# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
#
#-------------------------------------------------------------

## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
## Adds an index column to the result.
# INPUT:
# ------------------------------------------------------------------------------
# S (Frame[Unknown]): 1D input data frame containing text data.
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# result (Frame[Unknown]): Processed text data with an index column.
# ------------------------------------------------------------------------------

processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
print("processText");
tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
@@ -172,4 +171,4 @@ f_cooccurrenceMatrix = function(
[wordPosition, docID] = getWordPosition(processedResult, maxTokens);
[recodedWordPosition, tableSize, column] = getRecodedMatrix(wordPosition);
coocMatrix = createCoocMatrix(cbind(docID, recodedWordPosition), tableSize, distanceWeighting, symmetric, windowSize);
}
}
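A usage sketch mirroring the call made from glove.dml in this PR; the corpus path and the argument values are illustrative:

```dml
corpus = read("corpus.csv", data_type="frame", format="csv");  # placeholder path
[coocMatrix, coocIndex] = cooccurrenceMatrix(corpus, 1000, 5, TRUE, TRUE);
print("vocabulary size: " + nrow(coocIndex));
```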
4 changes: 2 additions & 2 deletions scripts/builtin/decisionTree.dml
@@ -30,9 +30,9 @@
# and the following trees, M would look as follows:
#
# (L1) |d<5|
# / \
# / \\
# (L2) P1:2 |a<7|
# / \
# / \\
# (L3) P2:2 P3:1
#
# --> M :=
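To connect the encoding above to a call, a hedged example; the ctypes row vector (1 = scale, 2 = categorical, one entry per feature plus the label) is an assumption based on the builtin's full header:

```dml
X = rand(rows=200, cols=4, min=0, max=10, seed=1);
y = round(rand(rows=200, cols=1, min=1, max=2, seed=2));  # labels in {1,2}
R = matrix("1 1 1 1 2", rows=1, cols=5);   # scale features, categorical label
M = decisionTree(X=X, y=y, ctypes=R, max_depth=3, seed=42);
print(toString(M));                        # linearized tree, as described above
```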
10 changes: 5 additions & 5 deletions scripts/builtin/dedup.dml
@@ -28,11 +28,11 @@
#
# INPUT:
# --------------------------------------------------------------------------------------
# X Input Frame[String] with n rows and d columns (raw tuples)
# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings), where |V| = number of words and e = embedding dimension
# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean"
# threshold (optional) Double: threshold value above which tuples are considered duplicates
# X Input Frame[String] with n rows and d columns (raw tuples)
# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings), where |V| = number of words and e = embedding dimension
# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean"
# threshold (optional) Double: threshold value above which tuples are considered duplicates
# --------------------------------------------------------------------------------------
#
# OUTPUT:
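A sketch using exactly the parameter names documented above; the file paths and the output variable name are placeholders:

```dml
X = read("tuples.csv", data_type="frame", format="csv");       # raw tuples
gloveMatrix = read("glove.mtx");                               # |V| x e embeddings
vocab = read("vocab.csv", data_type="frame", format="csv");    # |V| x 1 words
dups = dedup(X=X, gloveMatrix=gloveMatrix, vocab=vocab,
    similarityMeasure="cosine", threshold=0.9);
```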
5 changes: 5 additions & 0 deletions scripts/builtin/differenceStatistics.dml
@@ -28,6 +28,11 @@
# X First Matrix to compare
# Y Second Matrix to compare
# --------------------------------------------------------------------------------
#
# OUTPUT:
# -------------------------------------------------------------------------------------
# stats Difference statistics
# -------------------------------------------------------------------------------------

m_differenceStatistics = function(Matrix[Double] X, Matrix[Double] Y) {

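A minimal sketch of the call this header now documents. Whether the stats named in the new OUTPUT block are returned or only printed is not visible in this hunk, so the call below discards no return value:

```dml
X = rand(rows=50, cols=3, seed=1);
Y = X + rand(rows=50, cols=3, min=-0.1, max=0.1, seed=2);  # perturbed copy of X
differenceStatistics(X=X, Y=Y);   # reports difference statistics between X and Y
```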
87 changes: 45 additions & 42 deletions scripts/builtin/glove.dml
@@ -18,6 +18,51 @@
# under the License.
#-------------------------------------------------------------


# Computes the vector embeddings for words in a large text corpus.
#
# INPUT:
# --------------------------------------------------------------------------------
# input 1D input corpus in CSV format.
# seed Random seed for reproducibility.
# vector_size Dimensionality of word vectors, V.
# eta Learning rate for optimization, recommended value: 0.05.
# alpha Weighting function parameter, recommended value: 0.75.
# x_max Maximum co-occurrence value as per the GloVe paper: 100.
# tol Tolerance value to avoid overfitting, recommended value: 1e-4.
# iterations Total number of training iterations.
# print_loss_it Interval (in iterations) for printing the loss.
# maxTokens Maximum number of tokens per text entry.
# windowSize Context window size.
# distanceWeighting Whether to apply distance-based weighting.
# symmetric Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# G The word indices and their word vectors, of shape (N, V); each word vector has shape (1, V).
# ------------------------------------------------------------------------------


f_glove = function(
Frame[Unknown] input,
int seed, int vector_size,
double alpha, double eta,
double x_max,
double tol,
int iterations,
int print_loss_it,
Int maxTokens,
Int windowSize,
Boolean distanceWeighting,
Boolean symmetric)
return (frame[Unknown] G){

[cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);
G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
}


init = function(matrix[double] cooc_matrix, double x_max, double alpha)
return(matrix[double] weights, matrix[double] log_cooc_matrix){
E = 2.718281828;
@@ -118,45 +163,3 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i
print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error);
G = cbind(cooc_index[,2], as.frame(G));
}

glove = function(
Frame[Unknown] input,
int seed, int vector_size,
double alpha, double eta,
double x_max,
double tol,
int iterations,
int print_loss_it,
Int maxTokens,
Int windowSize,
Boolean distanceWeighting,
Boolean symmetric)
return (frame[Unknown] G){

/*
* Main function to Computes the vector embeddings for words in a large text corpus.
* INPUT:
* ------------------------------------------------------------------------------
* - input (Frame[Unknown]): 1DInput corpus in CSV format.
* - seed: Random seed for reproducibility.
* - vector_size: Dimensionality of word vectors, V.
* - eta: Learning rate for optimization, recommended value: 0.05.
* - alpha: Weighting function parameter, recommended value: 0.75.
* - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
* - tol: Tolerance value to avoid overfitting, recommended value: 1e-4.
* - iterations: Total number of training iterations.
* - print_loss_it: Interval (in iterations) for printing the loss.
* - maxTokens (Int): Maximum number of tokens per text entry.
* - windowSize (Int): Context window size.
* - distanceWeighting (Boolean): Whether to apply distance-based weighting.
* - symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
* ------------------------------------------------------------------------------
* OUTPUT:
* ------------------------------------------------------------------------------
* G (Frame[Unknown]): The word indices and their word vectors, of shape (N, V). Each represented as a vector, of shape (1,V)
* ------------------------------------------------------------------------------
*/

[cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);
G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
}
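End-to-end sketch of the new entry point, using the recommended values from the header (eta = 0.05, alpha = 0.75, x_max = 100, tol = 1e-4); it assumes the f_ prefix makes the function callable as glove, consistent with the parser change below, and the remaining values are illustrative:

```dml
corpus = read("corpus.csv", data_type="frame", format="csv");  # placeholder path
G = glove(input=corpus, seed=42, vector_size=50, alpha=0.75, eta=0.05,
    x_max=100, tol=1e-4, iterations=100, print_loss_it=10,
    maxTokens=1000, windowSize=5, distanceWeighting=TRUE, symmetric=TRUE);
```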
25 changes: 9 additions & 16 deletions scripts/builtin/imputeByKNN.dml
@@ -25,23 +25,16 @@
# the missing values by column means. Currently, only the column with the most
# missing values is actually imputed.
#
# ------------------------------------------------------------------------------
# INPUT:
# ------------------------------------------------------------------------------
# X Matrix with missing values, which are represented as NaNs
# method Method used for imputing missing values with different performance
# and accuracy tradeoffs:
# 'dist' (default): Compute all-pairs distances and impute the
# missing values by closest. O(N^2 * #features)
# 'dist_missing': Compute distances between data and records with
# missing values. O(N*M * #features), assuming
# that the number of records with MV is M<<N.
# 'dist_sample': Compute distances between sample of data and
# records with missing values. O(S*M * #features)
# with M<<N and S<<N, but suboptimal imputation.
# seed Root seed value for random/sample calls for deterministic behavior
# -1 for true randomization
# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
# X Matrix with missing values, which are represented as NaNs
# method Method used for imputing missing values with different performance and accuracy tradeoffs:\n
# - 'dist' (default): Compute all-pairs distances and impute the missing values by closest. O(N^2 * #features)
# - 'dist_missing': Compute distances between data and records with missing values. O(N*M * #features), assuming that the number of records with MV is M<<N.
# - 'dist_sample': Compute distances between sample of data and records with missing values. O(S*M * #features) with M<<N and S<<N, but suboptimal imputation.
#
# seed Root seed value for random/sample calls for deterministic behavior. -1 for true randomization
# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
# ------------------------------------------------------------------------------
#
# OUTPUT:
@@ -136,4 +129,4 @@ compute_missing_values = function (Matrix[Double] X, Matrix[Double] filled_matri
#Get the subset records that need to be imputed
imputedValue = t(reshaped) %*% aligned
imputedValue = t(imputedValue)
}
}
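A hedged sketch of the documented methods; the output variable name is illustrative:

```dml
X = rand(rows=100, cols=3, seed=3);
X[1,1] = NaN;                                    # inject a missing value
result = imputeByKNN(X=X, method="dist");        # default all-pairs method
print("remaining NaNs: " + sum(isNaN(result)));
```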
5 changes: 2 additions & 3 deletions scripts/builtin/quantizeByCluster.dml
@@ -58,7 +58,7 @@
# the product quantization. Only relevant when space_decomp = TRUE.
# ------------------------------------------------------------------------------------------

m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,
Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)
return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
{
@@ -118,5 +118,4 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, I
codes[,i] = tmp_c + offset
}
}
}

}
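The corrected signature above is fully visible, so a direct sketch; note that R is only meaningful when space_decomp = TRUE:

```dml
X = rand(rows=1000, cols=8, seed=5);
# M = 4 subspaces with k = 10 centroids each
[codebook, codes, R] = quantizeByCluster(X=X, M=4, k=10, seed=42);
print(nrow(codebook) + " codewords, " + ncol(codes) + " code columns");
```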
7 changes: 4 additions & 3 deletions scripts/builtin/randomForest.dml
@@ -26,16 +26,17 @@
# and optionally subset of features (columns). During tree construction, split
# candidates are additionally chosen on a sample of remaining features.
#
# .. code-block::
# .. code-block:: text
#
# For example, given a feature matrix with features [a,b,c,d]
# and the following two trees, M (the output) would look as follows:
#
# (L1) |a<7| |d<5|
# / \ / \
# / \\ / \\
# (L2) |c<3| |b<4| |a<7| P3:2
# / \ / \ / \
# / \\ / \\ / \\
# (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1
#
# --> M :=
# [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree)
# [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree)
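A hedged training sketch to accompany the two-tree example above; num_trees and the ctypes encoding (1 = scale, 2 = categorical) are assumptions based on the builtin's full header:

```dml
X = rand(rows=200, cols=4, min=0, max=10, seed=1);
y = round(rand(rows=200, cols=1, min=1, max=2, seed=2));
R = matrix("1 1 1 1 2", rows=1, cols=5);   # feature/label types
M = randomForest(X=X, y=y, ctypes=R, num_trees=2, seed=42);
print(toString(M));                        # one encoded tree per row, as above
```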
1 change: 1 addition & 0 deletions scripts/builtin/shapExplainer.dml
@@ -51,6 +51,7 @@
# S Matrix holding the shapley values along the cols, one row per instance.
# expected Double holding the average prediction of all instances.
# -----------------------------------------------------------------------------

s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances,
Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0,
Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0)
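A heavily hedged sketch of the signature shown above; "myPredict" is a hypothetical DML function that must exist in scope and score rows of x_instances, and list(W) stands in for its real arguments:

```dml
Xbg = rand(rows=100, cols=4, seed=1);   # background data
Xq  = Xbg[1:5,];                        # instances to explain
W   = rand(rows=4, cols=1, seed=2);     # toy weights for the hypothetical model
[S, expected] = shapExplainer(model_function="myPredict", model_args=list(W),
    x_instances=Xq, X_bg=Xbg, n_permutations=10, n_samples=100, seed=42);
```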
38 changes: 37 additions & 1 deletion scripts/builtin/topk_cleaning.dml
@@ -19,8 +19,44 @@
#
#-------------------------------------------------------------

# This function cleans top-K item (where K is given as input)for a given list of users.
# This function cleans top-K item (where K is given as input) for a given list of users.
# metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask
#
# INPUT:
# ------------------------------------------------------------------------------
# dataTrain Training set
# dataTest Test set; ignored when cv is set to TRUE
# metaData 3×n frame with schema, categorical mask, and FD mask for dataTrain
# primitives Library of primitive cleaning operators
# parameters Hyperparameter search space that matches the primitives
# refSol Reference solution
# evaluationFunc Name of a SystemDS DML function that scores a pipeline
# evalFunHp Hyperparameter matrix for the above evaluation function
# topK Number of best pipelines to return
# resource_val Maximum resource R for the Bandit search
# max_iter Maximum iterations while enumerating logical pipelines
# lq Lower quantile used by utils::doErrorSample when triggered
# uq Upper quantile used by utils::doErrorSample when triggered
# sample Fraction of rows to subsample from dataTrain
# expectedIncrease Minimum improvement over dirtyScore that a candidate must deliver
# seed Seed number
# cv TRUE means k-fold CV, FALSE means hold-out split
# cvk Number of folds if cv = TRUE
# isLastLabel TRUE if the last column is the label
# rowCount Row-count threshold above which doErrorSample may replace uniform sampling
# correctTypos Run spelling correction in the string preprocessing step
# enablePruning Enable pruning inside the Bandit phase
# ------------------------------------------------------------------------------
#
# OUTPUT:
#-------------------------------------------------------------------------------
# topKPipelines K cleaned-data pipelines
# topKHyperParams Hyperparameter matrix with rows aligning with topKPipelines
# topKScores Evaluation scores with rows aligning with topKPipelines
# dirtyScore Baseline score on the unclean data
# evalFunHp Updated evaluation function hyperparameters
# applyFunc Frame of “apply” functions for deploying each of the top-K pipelines
#-------------------------------------------------------------------------------

source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
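Finally, a heavily hedged invocation sketch tying the new INPUT/OUTPUT blocks together; every file path, the "evalClassification" scorer, and the evalFunHp matrix are placeholders, not part of this change:

```dml
F     = read("dirty.csv",      data_type="frame", format="csv");
prim  = read("primitives.csv", data_type="frame", format="csv");
param = read("parameters.csv", data_type="frame", format="csv");
[pip, hp, scores, dirtyScore, evalHp, applyFn] = topk_cleaning(dataTrain=F,
    primitives=prim, parameters=param, evaluationFunc="evalClassification",
    evalFunHp=as.matrix(1), topK=3, cv=TRUE, cvk=3, isLastLabel=TRUE);
```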
2 changes: 1 addition & 1 deletion src/main/python/docs/README.md
@@ -39,4 +39,4 @@ and then run `make html`:
make html
```

The docs will then be created at: `/src/main/python/build`in HTML will be placed in the `./_build` directory.
The docs will then be created at: `/src/main/python/docs/build/html/`.
3 changes: 2 additions & 1 deletion src/main/python/docs/requires-docs.txt
@@ -24,4 +24,5 @@ sphinx_rtd_theme
numpy
py4j
scipy
requests
requests
pandas
16 changes: 7 additions & 9 deletions src/main/python/generator/dml_parser.py
@@ -28,7 +28,7 @@
class FunctionParser(object):
header_input_pattern = r"^[ \t\n]*[#]+[ \t\n]*input[ \t\n\w:;.,#]*[\s#\-]*[#]+[\w\s\d:,.()\" \t\n\-]*[\s#\-]*$"
header_output_pattern = r"[\s#\-]*[#]+[ \t]*(return|output)[ \t\w:;.,#]*[\s#\-]*[#]+[\w\s\d:,.()\" \t\-]*[\s#\-]*$"
function_pattern = r"^[ms]_[\w]+[ \t\n]*=[ \t\n]+function[^#{]*"
function_pattern = r"^[fms]_[\w]+[ \t\n]*=[ \t\n]+function[^#{]*"
# parameter_pattern = r"^m_[\w]+[\s]+=[\s]+function[\s]*\([\s]*(?=return)[\s]*\)[\s]*return[\s]*\([\s]*([\w\[\]\s,\d=.\-_]*)[\s]*\)[\s]*"
header_parameter_pattern = r"[\s#\-]*[#]+[ \t]*([\w|-]+)[\s]+([\w]+)[\s]+([\w,\d.\"\-]+)[\s]+([\w|\W]+)"
divider_pattern = r"[\s#\-]*"
@@ -57,15 +57,13 @@ def parse_function(self, path: str):
"""
file_name = os.path.basename(path)
function_name, extension = os.path.splitext(file_name)
# try:
function_definition = self.find_function_definition(path)
# pattern = re.compile(
# self.__class__.parameter_pattern, flags=re.I | re.M)
# match = pattern.match(function_definition)

# if match:
try:
function_definition = self.find_function_definition(path)
except AttributeError:
print(f"[INFO] Skipping '{function_name}': does not match function name pattern. It is likely an internal function.")
return

func_split = function_definition.split("function")[1].split("return")
func_split = function_definition.split("function", 1)[1].split("return")

param_str = self.extract_param_str(func_split[0])
retval_str = None