diff --git a/src/.vuepress/sidebar/V1.3.x/en.ts b/src/.vuepress/sidebar/V1.3.x/en.ts index 99e5c9ac5..2bdfc6031 100644 --- a/src/.vuepress/sidebar/V1.3.x/en.ts +++ b/src/.vuepress/sidebar/V1.3.x/en.ts @@ -83,6 +83,7 @@ export const enSidebar = { { text: 'Stream Processing', link: 'Streaming_apache' }, { text: 'Data Sync', link: 'Data-Sync_apache' }, { text: 'Database Programming', link: 'Database-Programming' }, + { text: 'UDF', link: 'User-defined-function' }, { text: 'Database Administration', link: 'Authority-Management' }, { text: 'Maintennance', link: 'Maintennance' }, ], @@ -164,6 +165,7 @@ export const enSidebar = { // children: 'structure', children: [ { text: 'UDF Libraries', link: 'UDF-Libraries' }, + { text: 'UDF development', link: 'UDF-development' }, { text: 'Function and Expression', link: 'Function-and-Expression' }, { text: 'Common Config Manual', link: 'Common-Config-Manual' }, { text: 'Status Codes', link: 'Status-Codes' }, diff --git a/src/.vuepress/sidebar/V1.3.x/zh.ts b/src/.vuepress/sidebar/V1.3.x/zh.ts index 4f94062a4..1dbcd3d7d 100644 --- a/src/.vuepress/sidebar/V1.3.x/zh.ts +++ b/src/.vuepress/sidebar/V1.3.x/zh.ts @@ -83,6 +83,7 @@ export const zhSidebar = { { text: '流处理框架', link: 'Streaming_apache' }, { text: '数据同步', link: 'Data-Sync_apache' }, { text: '数据库编程', link: 'Database-Programming' }, + { text: '用户自定义函数', link: 'User-defined-function' }, { text: '权限管理', link: 'Authority-Management' }, { text: '运维语句', link: 'Maintennance' }, ], @@ -164,6 +165,7 @@ export const zhSidebar = { // children: 'structure', children: [ { text: 'UDF函数库', link: 'UDF-Libraries' }, + { text: 'UDF开发', link: 'UDF-development' }, { text: '内置函数与表达式', link: 'Function-and-Expression' }, { text: '配置参数', link: 'Common-Config-Manual' }, { text: 'ConfigNode配置参数', link: 'ConfigNode-Config-Manual' }, diff --git a/src/.vuepress/sidebar_timecho/V1.3.x/en.ts b/src/.vuepress/sidebar_timecho/V1.3.x/en.ts index e1c20151f..07140747b 100644 --- 
a/src/.vuepress/sidebar_timecho/V1.3.x/en.ts +++ b/src/.vuepress/sidebar_timecho/V1.3.x/en.ts @@ -90,6 +90,7 @@ export const enSidebar = { { text: 'View', link: 'IoTDB-View_timecho' }, { text: 'AI Capability', link: 'AINode_timecho' }, { text: 'Database Programming', link: 'Database-Programming' }, + { text: 'UDF', link: 'User-defined-function' }, { text: 'Security Management', link: 'Security-Management_timecho' }, { text: 'Database Administration', link: 'Authority-Management' }, { text: 'Maintennance', link: 'Maintennance' }, @@ -175,6 +176,7 @@ export const enSidebar = { // children: 'structure', children: [ { text: 'UDF Libraries', link: 'UDF-Libraries' }, + { text: 'UDF development', link: 'UDF-development' }, { text: 'Function and Expression', link: 'Function-and-Expression' }, { text: 'Common Config Manual', link: 'Common-Config-Manual' }, { text: 'ConfigNode Config Manual', link: 'ConfigNode-Config-Manual' }, diff --git a/src/.vuepress/sidebar_timecho/V1.3.x/zh.ts b/src/.vuepress/sidebar_timecho/V1.3.x/zh.ts index 3785d1f03..d9e7e3520 100644 --- a/src/.vuepress/sidebar_timecho/V1.3.x/zh.ts +++ b/src/.vuepress/sidebar_timecho/V1.3.x/zh.ts @@ -90,6 +90,7 @@ export const zhSidebar = { { text: '视图', link: 'IoTDB-View_timecho' }, { text: 'AI能力', link: 'AINode_timecho' }, { text: '数据库编程', link: 'Database-Programming' }, + { text: '用户自定义函数', link: 'User-defined-function' }, { text: '安全控制', link: 'Security-Management_timecho' }, { text: '权限管理', link: 'Authority-Management' }, { text: '运维语句', link: 'Maintennance' }, @@ -175,6 +176,7 @@ export const zhSidebar = { // children: 'structure', children: [ { text: 'UDF函数库', link: 'UDF-Libraries' }, + { text: 'UDF开发', link: 'UDF-development' }, { text: '内置函数与表达式', link: 'Function-and-Expression' }, { text: '配置参数', link: 'Common-Config-Manual' }, { text: 'ConfigNode配置参数', link: 'ConfigNode-Config-Manual' }, diff --git a/src/UserGuide/Master/Reference/UDF-Libraries.md b/src/UserGuide/Master/Reference/UDF-Libraries.md index 
e3364d513..9361ac142 100644 --- a/src/UserGuide/Master/Reference/UDF-Libraries.md +++ b/src/UserGuide/Master/Reference/UDF-Libraries.md @@ -21,10 +21,39 @@ # UDF Libraries +# UDF Libraries + +Based on the ability of user-defined functions, IoTDB provides a series of functions for temporal data processing, including data quality, data profiling, anomaly detection, frequency domain analysis, data matching, data repairing, sequence discovery, machine learning, etc., which can meet the needs of industrial fields for temporal data processing. + +## Installation steps + +1. Please obtain the compressed file of the UDF library JAR package that is compatible with the IoTDB version. + + | UDF libraries version | Supported IoTDB versions | Download link | + | --------------- | ----------------- | ------------------------------------------------------------ | + | UDF-1.3.3.zip | V1.3.3 and above | [UDF.zip](https://alioss.timecho.com/upload/UDF-1.3.3.zip) | + | UDF-1.3.2.zip | V1.0.0~V1.3.2 | [UDF.zip](https://alioss.timecho.com/upload/UDF-1.3.2.zip) | + +2. Place the library-udf.jar file from the obtained compressed package in the path of IoTDB at `/iotdb-enterprise-x.x.x.x-bin/ext/udf` +3. In the SQL command line terminal (CLI) or visualization console (Workbench) SQL operation interface of IoTDB, execute the corresponding function registration statement as follows. +4. 
Batch registration: Two registration methods: registration script or SQL full statement +- Register Script + - Copy the registration script (register-UDF.sh or register-UDF.bat) from the compressed package to the `tools` directory of IoTDB as needed, and modify the parameters in the script (default is host=127.0.0.1, rpcPort=6667, user=root, pass=root); + - Start IoTDB service, run registration script to batch register UDF + +- All SQL statements + - Open the SQl file in the compressed package, copy all SQL statements, and execute all SQl statements in the SQL command line terminal (CLI) of IoTDB or the SQL operation interface of the visualization console (Workbench) to batch register UDF + ## Data Quality ### Completeness +#### Registration statement + +```sql +create function completeness as 'org.apache.iotdb.library.dquality.UDTFCompleteness' +``` + #### Usage This function is used to calculate the completeness of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the completeness of each window will be output. @@ -150,6 +179,12 @@ Output series: ### Consistency +#### Registration statement + +```sql +create function consistency as 'org.apache.iotdb.library.dquality.UDTFConsistency' +``` + #### Usage This function is used to calculate the consistency of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the consistency of each window will be output. @@ -274,6 +309,12 @@ Output series: ### Timeliness +#### Registration statement + +```sql +create function timeliness as 'org.apache.iotdb.library.dquality.UDTFTimeliness' +``` + #### Usage This function is used to calculate the timeliness of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the timeliness of each window will be output. 
@@ -398,6 +439,12 @@ Output series: ### Validity +#### Registration statement + +```sql +create function validity as 'org.apache.iotdb.library.dquality.UDTFValidity' +``` + #### Usage This function is used to calculate the Validity of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the Validity of each window will be output. @@ -547,11 +594,17 @@ Output series: ### ACF +#### Registration statement + +```sql +create function acf as 'org.apache.iotdb.library.dprofile.UDTFACF' +``` + #### Usage This function is used to calculate the auto-correlation factor of the input time series, which equals to cross correlation between the same series. -For more information, please refer to [XCorr](./UDF-Libraries.md#XCorr) function. +For more information, please refer to [XCorr](./UDF-Libraries.md#xcorr) function. **Name:** ACF @@ -606,6 +659,12 @@ Output series: ### Distinct +#### Registration statement + +```sql +create function distinct as 'org.apache.iotdb.library.dprofile.UDTFDistinct' +``` + #### Usage This function returns all unique values in time series. @@ -659,6 +718,12 @@ Output series: ### Histogram +#### Registration statement + +```sql +create function histogram as 'org.apache.iotdb.library.dprofile.UDTFHistogram' +``` + #### Usage This function is used to calculate the distribution histogram of a single column of numerical data. 
@@ -738,6 +803,12 @@ Output series: ### Integral +#### Registration statement + +```sql +create function integral as 'org.apache.iotdb.library.dprofile.UDAFIntegral' +``` + #### Usage This function is used to calculate the integration of time series, @@ -829,6 +900,12 @@ $$\frac{1}{2\times 60}[(1+2) \times 1 + (2+5) \times 1 + (5+6) \times 1 + (6+7) ### IntegralAvg +#### Registration statement + +```sql +create function integralavg as 'org.apache.iotdb.library.dprofile.UDAFIntegralAvg' +``` + #### Usage This function is used to calculate the function average of time series. @@ -890,6 +967,12 @@ $$\frac{1}{2}[(1+2) \times 1 + (2+5) \times 1 + (5+6) \times 1 + (6+7) \times 1 ### Mad +#### Registration statement + +```sql +create function mad as 'org.apache.iotdb.library.dprofile.UDAFMad' +``` + #### Usage The function is used to compute the exact or approximate median absolute deviation (MAD) of a numeric time series. MAD is the median of the deviation of each element from the elements' median. @@ -988,6 +1071,12 @@ Output series: ### Median +#### Registration statement + +```sql +create function median as 'org.apache.iotdb.library.dprofile.UDAFMedian' +``` + #### Usage The function is used to compute the exact or approximate median of a numeric time series. Median is the value separating the higher half from the lower half of a data sample. @@ -1058,6 +1147,12 @@ Output series: ### MinMax +#### Registration statement + +```sql +create function minmax as 'org.apache.iotdb.library.dprofile.UDTFMinMax' +``` + #### Usage This function is used to standardize the input series with min-max. Minimum value is transformed to 0; maximum value is transformed to 1. @@ -1197,6 +1292,12 @@ Output series: ### MvAvg +#### Registration statement + +```sql +create function mvavg as 'org.apache.iotdb.library.dprofile.UDTFMvAvg' +``` + #### Usage This function is used to calculate moving average of input series. 
@@ -1277,6 +1378,12 @@ Output series: ### PACF +#### Registration statement + +```sql +create function pacf as 'org.apache.iotdb.library.dprofile.UDTFPACF' +``` + #### Usage This function is used to calculate partial autocorrelation of input series by solving Yule-Walker equation. For some cases, the equation may not be solved, and NaN will be output. @@ -1346,6 +1453,12 @@ Output series: ### Percentile +#### Registration statement + +```sql +create function percentile as 'org.apache.iotdb.library.dprofile.UDAFPercentile' +``` + #### Usage The function is used to compute the exact or approximate percentile of a numeric time series. A percentile is value of element in the certain rank of the sorted series. @@ -1419,6 +1532,12 @@ Output series: ### Quantile +#### Registration statement + +```sql +create function quantile as 'org.apache.iotdb.library.dprofile.UDAFQuantile' +``` + #### Usage The function is used to compute the approximate quantile of a numeric time series. A quantile is value of element in the certain rank of the sorted series. @@ -1492,6 +1611,12 @@ Output series: ### Period +#### Registration statement + +```sql +create function period as 'org.apache.iotdb.library.dprofile.UDAFPeriod' +``` + #### Usage The function is used to compute the period of a numeric time series. @@ -1541,6 +1666,12 @@ Output series: ### QLB +#### Registration statement + +```sql +create function qlb as 'org.apache.iotdb.library.dprofile.UDTFQLB' +``` + #### Usage This function is used to calculate Ljung-Box statistics $Q_{LB}$ for time series, and convert it to p value. 
@@ -1625,6 +1756,12 @@ Output series: ### Resample +#### Registration statement + +```sql +create function re_sample as 'org.apache.iotdb.library.dprofile.UDTFResample' +``` + #### Usage This function is used to resample the input series according to a given frequency, @@ -1754,6 +1891,12 @@ Output series: ### Sample +#### Registration statement + +```sql +create function sample as 'org.apache.iotdb.library.dprofile.UDTFSample' +``` + #### Usage This function is used to sample the input series, @@ -1852,6 +1995,12 @@ Output series: ### Segment +#### Registration statement + +```sql +create function segment as 'org.apache.iotdb.library.dprofile.UDTFSegment' +``` + #### Usage This function is used to segment a time series into subsequences according to linear trend, and returns linear fitted values of first values in each subsequence or every data point. @@ -1944,6 +2093,12 @@ Output series: ### Skew +#### Registration statement + +```sql +create function skew as 'org.apache.iotdb.library.dprofile.UDAFSkew' +``` + #### Usage This function is used to calculate the population skewness. @@ -2005,6 +2160,12 @@ Output series: ### Spline +#### Registration statement + +```sql +create function spline as 'org.apache.iotdb.library.dprofile.UDTFSpline' +``` + #### Usage This function is used to calculate cubic spline interpolation of input series. @@ -2210,6 +2371,12 @@ Output series: ### Spread +#### Registration statement + +```sql +create function spread as 'org.apache.iotdb.library.dprofile.UDAFSpread' +``` + #### Usage This function is used to calculate the spread of time series, that is, the maximum value minus the minimum value. @@ -2327,6 +2494,12 @@ Output series: ### ZScore +#### Registration statement + +```sql +create function zscore as 'org.apache.iotdb.library.dprofile.UDTFZScore' +``` + #### Usage This function is used to standardize the input series with z-score. 
@@ -2433,6 +2606,12 @@ Output series: ### IQR +#### Registration statement + +```sql +create function iqr as 'org.apache.iotdb.library.anomaly.UDTFIQR' +``` + #### Usage This function is used to detect anomalies based on IQR. Points distributing beyond 1.5 times IQR are selected. @@ -2500,6 +2679,12 @@ Output series: ### KSigma +#### Registration statement + +```sql +create function ksigma as 'org.apache.iotdb.library.anomaly.UDTFKSigma' +``` + #### Usage This function is used to detect anomalies based on the Dynamic K-Sigma Algorithm. @@ -2565,6 +2750,12 @@ Output series: ### LOF +#### Registration statement + +```sql +create function LOF as 'org.apache.iotdb.library.anomaly.UDTFLOF' +``` + #### Usage This function is used to detect density anomaly of time series. According to k-th distance calculation parameter and local outlier factor (lof) threshold, the function judges if a set of input values is an density anomaly, and a bool mark of anomaly values will be output. @@ -2691,6 +2882,12 @@ Output series: ### MissDetect +#### Registration statement + +```sql +create function missdetect as 'org.apache.iotdb.library.anomaly.UDTFMissDetect' +``` + #### Usage This function is used to detect missing anomalies. @@ -2779,6 +2976,12 @@ Output series: ### Range +#### Registration statement + +```sql +create function range as 'org.apache.iotdb.library.anomaly.UDTFRange' +``` + #### Usage This function is used to detect range anomaly of time series. According to upper bound and lower bound parameters, the function judges if a input value is beyond range, aka range anomaly, and a new time series of anomaly will be output. @@ -2844,6 +3047,12 @@ Output series: ### TwoSidedFilter +#### Registration statement + +```sql +create function twosidedfilter as 'org.apache.iotdb.library.anomaly.UDTFTwoSidedFilter' +``` + #### Usage The function is used to filter anomalies of a numeric time series based on two-sided window detection. 
@@ -2937,6 +3146,12 @@ Output series: ### Outlier +#### Registration statement + +```sql +create function outlier as 'org.apache.iotdb.library.anomaly.UDTFOutlier' +``` + #### Usage This function is used to detect distance-based outliers. For each point in the current window, if the number of its neighbors within the distance of neighbor distance threshold is less than the neighbor count threshold, the point in detected as an outlier. @@ -3260,6 +3475,12 @@ Output series: ### Conv +#### Registration statement + +```sql +create function conv as 'org.apache.iotdb.library.frequency.UDTFConv' +``` + #### Usage This function is used to calculate the convolution, i.e. polynomial multiplication. @@ -3307,6 +3528,12 @@ Output series: ### Deconv +#### Registration statement + +```sql +create function deconv as 'org.apache.iotdb.library.frequency.UDTFDeconv' +``` + #### Usage This function is used to calculate the deconvolution, i.e. polynomial division. @@ -3387,6 +3614,12 @@ Output series: ### DWT +#### Registration statement + +```sql +create function dwt as 'org.apache.iotdb.library.frequency.UDTFDWT' +``` + #### Usage This function is used to calculate 1d discrete wavelet transform of a numerical series. @@ -3468,6 +3701,12 @@ Output series: ### FFT +#### Registration statement + +```sql +create function fft as 'org.apache.iotdb.library.frequency.UDTFFFT' +``` + #### Usage This function is used to calculate the fast Fourier transform (FFT) of a numerical series. @@ -3592,6 +3831,12 @@ The last data point is reserved to indicate the length of the series. ### HighPass +#### Registration statement + +```sql +create function highpass as 'org.apache.iotdb.library.frequency.UDTFHighPass' +``` + #### Usage This function performs low-pass filtering on the input series and extracts components above the cutoff frequency. @@ -3679,6 +3924,12 @@ Note: The input is $y=sin(2\pi t/4)+2sin(2\pi t/5)$ with a length of 20. 
Thus, t ### IFFT +#### Registration statement + +```sql +create function ifft as 'org.apache.iotdb.library.frequency.UDTFIFFT' +``` + #### Usage This function treats the two input series as the real and imaginary part of a complex series, performs an inverse fast Fourier transform (IFFT), and outputs the real part of the result. @@ -3756,6 +4007,12 @@ Output series: ### LowPass +#### Registration statement + +```sql +create function lowpass as 'org.apache.iotdb.library.frequency.UDTFLowPass' +``` + #### Usage This function performs low-pass filtering on the input series and extracts components below the cutoff frequency. @@ -3866,6 +4123,12 @@ Note: The input is $y=sin(2\pi t/4)+2sin(2\pi t/5)$ with a length of 20. Thus, t ### Cov +#### Registration statement + +```sql +create function cov as 'org.apache.iotdb.library.dmatch.UDAFCov' +``` + #### Usage This function is used to calculate the population covariance. @@ -3927,6 +4190,12 @@ Output series: ### DTW +#### Registration statement + +```sql +create function dtw as 'org.apache.iotdb.library.dmatch.UDAFDtw' +``` + #### Usage This function is used to calculate the DTW distance between two input series. @@ -3992,6 +4261,12 @@ Output series: ### Pearson +#### Registration statement + +```sql +create function pearson as 'org.apache.iotdb.library.dmatch.UDAFPearson' +``` + #### Usage This function is used to calculate the Pearson Correlation Coefficient. @@ -4053,6 +4328,12 @@ Output series: ### PtnSym +#### Registration statement + +```sql +create function ptnsym as 'org.apache.iotdb.library.dmatch.UDTFPtnSym' +``` + #### Usage This function is used to find all symmetric subseries in the input whose degree of symmetry is less than the threshold. @@ -4113,6 +4394,12 @@ Output series: ### XCorr +#### Registration statement + +```sql +create function xcorr as 'org.apache.iotdb.library.dmatch.UDTFXCorr' +``` + #### Usage This function is used to calculate the cross correlation function of given two time series. 
@@ -4202,6 +4489,14 @@ Output series: ### TimestampRepair +#### Registration statement + +```sql +create function timestamprepair as 'org.apache.iotdb.library.drepair.UDTFTimestampRepair' +``` + +#### Usage + This function is used for timestamp repair. According to the given standard time interval, the method of minimizing the repair cost is adopted. @@ -4303,6 +4598,12 @@ Output series: ### ValueFill +#### Registration statement + +```sql +create function valuefill as 'org.apache.iotdb.library.drepair.UDTFValueFill' +``` + #### Usage This function is used to impute time series. Several methods are supported. @@ -4415,6 +4716,12 @@ Output series: ### ValueRepair +#### Registration statement + +```sql +create function valuerepair as 'org.apache.iotdb.library.drepair.UDTFValueRepair' +``` + #### Usage This function is used to repair the value of the time series. @@ -4723,6 +5030,12 @@ Output series: ### ConsecutiveSequences +#### Registration statement + +```sql +create function consecutivesequences as 'org.apache.iotdb.library.series.UDTFConsecutiveSequences' +``` + #### Usage This function is used to find locally longest consecutive subsequences in strictly equispaced multidimensional data. @@ -4811,6 +5124,12 @@ Output series: ### ConsecutiveWindows +#### Registration statement + +```sql +create function consecutivewindows as 'org.apache.iotdb.library.series.UDTFConsecutiveWindows' +``` + #### Usage This function is used to find consecutive windows of specified length in strictly equispaced multidimensional data. @@ -4897,6 +5216,12 @@ Output series: ### AR +#### Registration statement + +```sql +create function ar as 'org.apache.iotdb.library.dlearn.UDTFAR' +``` + #### Usage This function is used to learn the coefficients of the autoregressive models for a time series. 
diff --git a/src/UserGuide/Master/Reference/UDF-development.md b/src/UserGuide/Master/Reference/UDF-development.md new file mode 100644 index 000000000..8e14f31db --- /dev/null +++ b/src/UserGuide/Master/Reference/UDF-development.md @@ -0,0 +1,646 @@ + # UDF development + +## UDF development + +### UDF Development Dependencies + +If you use [Maven](http://search.maven.org/), you can search for the development dependencies listed below from the [Maven repository](http://search.maven.org/) . Please note that you must select the same dependency version as the target IoTDB server version for development. + +``` xml + + org.apache.iotdb + udf-api + 1.0.0 + provided + +``` + +## UDTF(User Defined Timeseries Generating Function) + +To write a UDTF, you need to inherit the `org.apache.iotdb.udf.api.UDTF` class, and at least implement the `beforeStart` method and a `transform` method. + +#### Interface Description: + +| Interface definition | Description | Required to Implement | +| :----------------------------------------------------------- | :----------------------------------------------------------- | ----------------------------------------------------- | +| void validate(UDFParameterValidator validator) throws Exception | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. | Optional | +| void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception | The initialization method to call the user-defined initialization behavior before a UDTF processes the input data. Every time a user executes a UDTF query, the framework will construct a new UDF instance, and `beforeStart` will be called. | Required | +| void transform(Row row, PointCollector collector) throws Exception | This method is called by the framework. 
This data processing method will be called when you choose to use the `RowByRowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `Row`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | +| void transform(RowWindow rowWindow, PointCollector collector) throws Exception | This method is called by the framework. This data processing method will be called when you choose to use the `SlidingSizeWindowAccessStrategy` or `SlidingTimeWindowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `RowWindow`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | +| void terminate(PointCollector collector) throws Exception | This method is called by the framework. This method will be called once after all `transform` calls have been executed. In a single UDF query, this method will and will only be called once. You need to call the data collection method provided by `collector` to determine the output data. | Optional | +| void beforeDestroy() | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | + +In the life cycle of a UDTF instance, the calling sequence of each method is as follows: + +1. void validate(UDFParameterValidator validator) throws Exception +2. void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception +3. void transform(Row row, PointCollector collector) throws Exception or void transform(RowWindow rowWindow, PointCollector collector) throws Exception +4. 
void terminate(PointCollector collector) throws Exception +5. void beforeDestroy() + +> Note that every time the framework executes a UDTF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDTF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDTF without considering the influence of concurrency and other factors. + +#### Detailed interface introduction: + +1. **void validate(UDFParameterValidator validator) throws Exception** + +The `validate` method is used to validate the parameters entered by the user. + +In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. + +Please refer to the Javadoc for the usage of `UDFParameterValidator`. + + +2. **void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception** + +This method is mainly used to customize UDTF. In this method, the user can do the following things: + +1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. +2. Set the strategy to access the raw data and set the output data type in UDTFConfigurations. +3. Create resources, such as establishing external connections, opening files, etc. + + +2.1 **UDFParameters** + +`UDFParameters` is used to parse UDF parameters in SQL statements (the part in parentheses after the UDF function name in SQL). The input parameters have two parts. The first part is data types of the time series that the UDF needs to process, and the second part is the key-value pair attributes for customization. Only the second part can be empty. 
+ + +Example: + +``` sql +SELECT UDF(s1, s2, 'key1'='iotdb', 'key2'='123.45') FROM root.sg.d; +``` + +Usage: + +``` java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + String stringValue = parameters.getString("key1"); // iotdb + Float floatValue = parameters.getFloat("key2"); // 123.45 + Double doubleValue = parameters.getDouble("key3"); // null + int intValue = parameters.getIntOrDefault("key4", 678); // 678 + // do something + + // configurations + // ... +} +``` + + +2.2 **UDTFConfigurations** + +You must use `UDTFConfigurations` to specify the strategy used by UDF to access raw data and the type of output sequence. + +Usage: + +``` java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + // parameters + // ... + + // configurations + configurations + .setAccessStrategy(new RowByRowAccessStrategy()) + .setOutputDataType(Type.INT32); +} +``` + +The `setAccessStrategy` method is used to set the UDF's strategy for accessing the raw data, and the `setOutputDataType` method is used to set the data type of the output sequence. + + 2.2.1 **setAccessStrategy** + + +Note that the raw data access strategy you set here determines which `transform` method the framework will call. Please implement the `transform` method corresponding to the raw data access strategy. Of course, you can also dynamically decide which strategy to set based on the attribute parameters parsed by `UDFParameters`. Therefore, two `transform` methods are also allowed to be implemented in one UDF. + +The following are the strategies you can set: + +| Interface definition | Description | The `transform` Method to Call | +| :-------------------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ | +| MappableRowByRow | Custom scalar function
The framework will call the `transform` method once for each row of raw data input, with k columns of time series and 1 row of data input, and 1 column of time series and 1 row of data output. It can be used in any clause and expression where scalar functions appear, such as select clauses, where clauses, etc. | void transform(Column[] columns, ColumnBuilder builder) throws Exception<br>Object transform(Row row) throws Exception | | RowByRowAccessStrategy | Customize time series generation function to process raw data line by line.<br>
The framework will call the `transform` method once for each row of raw data input, inputting k columns of time series and 1 row of data, and outputting 1 column of time series and n rows of data.
When a sequence is input, the row serves as a data point for the input sequence.
When multiple sequences are input, after aligning the input sequences in time, each row serves as a data point for the input sequence.
(In a row of data, there may be a column with a `null` value, but not all columns are `null`) | void transform(Row row, PointCollector collector) throws Exception | +| SlidingTimeWindowAccessStrategy | Customize time series generation functions to process raw data in a sliding time window manner.
The framework will call the `transform` method once for each raw data input window, input k columns of time series m rows of data, and output 1 column of time series n rows of data.
A window may contain multiple rows of data, and after aligning the input sequence in time, each window serves as a data point for the input sequence.
(Each window may have i rows, and each row of data may have a column with a `null` value, but not all of them are `null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| SlidingSizeWindowAccessStrategy | Customize the time series generation function to process raw data in a fixed number of rows, meaning that each data processing window will contain a fixed number of rows of data (except for the last window).
The framework will call the `transform` method once for each raw data input window, input k columns of time series m rows of data, and output 1 column of time series n rows of data.
A window may contain multiple rows of data, and after aligning the input sequence in time, each window serves as a data point for the input sequence.
(Each window may have i rows, and each row of data may have a column with a `null` value, but not all of them are `null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| SessionTimeWindowAccessStrategy | Customize time series generation functions to process raw data in a session window format.
The framework will call the `transform` method once for each raw data input window, input k columns of time series m rows of data, and output 1 column of time series n rows of data.
A window may contain multiple rows of data, and after aligning the input sequence in time, each window serves as a data point for the input sequence.
(Each window may have i rows, and each row of data may have a column with a `null` value, but not all of them are `null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| StateWindowAccessStrategy | Customize time series generation functions to process raw data in a state window format.
The framework will call the `transform` method once for each raw data input window, inputting 1 column of time series m rows of data and outputting 1 column of time series n rows of data.
A window may contain multiple rows of data, and currently only supports opening windows for one physical quantity, which is one column of data. | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | + + +#### Interface Description: + +- `RowByRowAccessStrategy`: The construction of `RowByRowAccessStrategy` does not require any parameters. + +- `SlidingTimeWindowAccessStrategy` + +Window opening diagram: + + + +`SlidingTimeWindowAccessStrategy`: `SlidingTimeWindowAccessStrategy` has many constructors, you can pass 3 types of parameters to them: + +- Parameter 1: The display window on the time axis + +The first type of parameters are optional. If the parameters are not provided, the beginning time of the display window will be set to the same as the minimum timestamp of the query result set, and the ending time of the display window will be set to the same as the maximum timestamp of the query result set. + +- Parameter 2: Time interval for dividing the time axis (should be positive) +- Parameter 3: Time sliding step (not required to be greater than or equal to the time interval, but must be a positive number) + +The sliding step parameter is also optional. If the parameter is not provided, the sliding step will be set to the same as the time interval for dividing the time axis. + +The relationship between the three types of parameters can be seen in the figure below. Please see the Javadoc for more details. + +
+ +> Note that the actual time interval of some of the last time windows may be less than the specified time interval parameter. In addition, there may be cases where the number of data rows in some time windows is 0. In these cases, the framework will also call the `transform` method for the empty windows. + +- `SlidingSizeWindowAccessStrategy` + +Window opening diagram: + + + +`SlidingSizeWindowAccessStrategy`: `SlidingSizeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: + +* Parameter 1: Window size. This parameter specifies the number of data rows contained in a data processing window. Note that the number of data rows in some of the last time windows may be less than the specified number of data rows. +* Parameter 2: Sliding step. This parameter means the number of rows between the first point of the next window and the first point of the current window. (This parameter is not required to be greater than or equal to the window size, but must be a positive number) + +The sliding step parameter is optional. If the parameter is not provided, the sliding step will be set to the same as the window size. + +- `SessionTimeWindowAccessStrategy` + +Window opening diagram: **Time intervals less than or equal to the given minimum time interval `sessionGap` are assigned in one group.** + + + +`SessionTimeWindowAccessStrategy`: `SessionTimeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: + +- Parameter 1: The display window on the time axis. +- Parameter 2: The minimum time interval `sessionGap` of two adjacent windows. + +- `StateWindowAccessStrategy` + +Window opening diagram: **For numerical data, if the state difference is less than or equal to the given threshold `delta`, it will be assigned in one group.** + + + +`StateWindowAccessStrategy` has four constructors. 
+ +- Constructor 1: For numerical data, there are 3 parameters: the time axis can display the start and end time of the time window and the threshold `delta` for the allowable change within a single window. +- Constructor 2: For text data and boolean data, there are 3 parameters: the time axis can be provided to display the start and end time of the time window. For both data types, the data within a single window is the same, and there is no need to provide an allowable change threshold. +- Constructor 3: For numerical data, there is 1 parameter: you can only provide the threshold `delta` that is allowed to change within a single window. The start time of the time axis display time window will be defined as the smallest timestamp in the entire query result set, and the time axis display time window end time will be defined as the largest timestamp in the entire query result set. +- Constructor 4: For text data and boolean data, you can provide no parameter. The start and end timestamps are explained in Constructor 3. + +StateWindowAccessStrategy can only take one column as input for now. + +Please see the Javadoc for more details. + + 2.2.2 **setOutputDataType** + +Note that the type of output sequence you set here determines the type of data that the `PointCollector` can actually receive in the `transform` method. 
The relationship between the output data type set in `setOutputDataType` and the actual data output type that `PointCollector` can receive is as follows: + +| Output Data Type Set in `setOutputDataType` | Data Type that `PointCollector` Can Receive | +| :------------------------------------------ | :----------------------------------------------------------- | +| INT32 | int | +| INT64 | long | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | boolean | +| TEXT | `java.lang.String` and `org.apache.iotdb.udf.api.type.Binary` | + +The type of output time series of a UDTF is determined at runtime, which means that a UDTF can dynamically determine the type of output time series according to the type of input time series. +Here is a simple example: + +```java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + // do something + // ... + + configurations + .setAccessStrategy(new RowByRowAccessStrategy()) + .setOutputDataType(parameters.getDataType(0)); +} +``` + +3. **void transform(Row row, PointCollector collector) throws Exception** + +You need to implement this method when you specify the strategy of UDF to read the original data as `RowByRowAccessStrategy`. + +This method processes the raw data one row at a time. The raw data is input from `Row` and output by `PointCollector`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. + +The following is a complete UDF example that implements the `void transform(Row row, PointCollector collector) throws Exception` method. It is an adder that receives two columns of time series as input. When two data points in a row are not `null`, this UDF will output the algebraic sum of these two data points. 
+ +``` java +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Adder implements UDTF { + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT64) + .setAccessStrategy(new RowByRowAccessStrategy()); + } + + @Override + public void transform(Row row, PointCollector collector) throws Exception { + if (row.isNull(0) || row.isNull(1)) { + return; + } + collector.putLong(row.getTime(), row.getLong(0) + row.getLong(1)); + } +} +``` + +4. **void transform(RowWindow rowWindow, PointCollector collector) throws Exception** + +You need to implement this method when you specify the strategy of UDF to read the original data as `SlidingTimeWindowAccessStrategy` or `SlidingSizeWindowAccessStrategy`. + +This method processes a batch of data in a fixed number of rows or a fixed time interval each time, and we call the container containing this batch of data a window. The raw data is input from `RowWindow` and output by `PointCollector`. `RowWindow` can help you access a batch of `Row`, it provides a set of interfaces for random access and iterative access to this batch of `Row`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. + +Below is a complete UDF example that implements the `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` method. 
It is a counter that receives any number of time series as input, and its function is to count and output the number of data rows in each time window within a specified time range. + +```java +import java.io.IOException; +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.access.RowWindow; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.SlidingTimeWindowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Counter implements UDTF { + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT32) + .setAccessStrategy(new SlidingTimeWindowAccessStrategy( + parameters.getLong("time_interval"), + parameters.getLong("sliding_step"), + parameters.getLong("display_window_begin"), + parameters.getLong("display_window_end"))); + } + + @Override + public void transform(RowWindow rowWindow, PointCollector collector) { + if (rowWindow.windowSize() != 0) { + collector.putInt(rowWindow.windowStartTime(), rowWindow.windowSize()); + } + } +} +``` + +5. **void terminate(PointCollector collector) throws Exception** + +In some scenarios, a UDF needs to traverse all the original data to calculate the final output data points. The `terminate` interface provides support for those scenarios. + +This method is called after all `transform` calls are executed and before the `beforeDestroy` method is executed. You can implement the `transform` method to perform pure data processing (without outputting any data points), and implement the `terminate` method to output the processing results. + +The processing results need to be output by the `PointCollector`. 
You can output any number of data points in one `terminate` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. + +Below is a complete UDF example that implements the `void terminate(PointCollector collector) throws Exception` method. It takes one time series whose data type is `INT32` as input, and outputs the maximum value point of the series. + +```java +import java.io.IOException; +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Max implements UDTF { + + private Long time; + private int value; + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT32) + .setAccessStrategy(new RowByRowAccessStrategy()); + } + + @Override + public void transform(Row row, PointCollector collector) { + if (row.isNull(0)) { + return; + } + int candidateValue = row.getInt(0); + if (time == null || value < candidateValue) { + time = row.getTime(); + value = candidateValue; + } + } + + @Override + public void terminate(PointCollector collector) throws IOException { + if (time != null) { + collector.putInt(time, value); + } + } +} +``` + +6. **void beforeDestroy()** + +The method for terminating a UDF. + +This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. 
+ + + +### UDAF (User Defined Aggregation Function) + +A complete definition of UDAF involves two classes, `State` and `UDAF`. + +#### State Class + +To write your own `State`, you need to implement the `org.apache.iotdb.udf.api.State` interface. + +#### Interface Description: + +| Interface Definition | Description | Required to Implement | +| -------------------------------- | ------------------------------------------------------------ | --------------------- | +| void reset() | To reset the `State` object to its initial state, you need to fill in the initial values of the fields in the `State` class within this method as if you were writing a constructor. | Required | +| byte[] serialize() | Serializes `State` to binary data. This method is used for IoTDB internal `State` passing. Note that the order of serialization must be consistent with the following deserialization methods. | Required | +| void deserialize(byte[] bytes) | Deserializes binary data to `State`. This method is used for IoTDB internal `State` passing. Note that the order of deserialization must be consistent with the serialization method above. | Required | + +#### Detailed interface introduction: + +1. **void reset()** + +This method resets the `State` to its initial state, you need to fill in the initial values of the fields in the `State` object in this method. For optimization reasons, IoTDB reuses `State` as much as possible internally, rather than creating a new `State` for each group, which would introduce unnecessary overhead. When `State` has finished updating the data in a group, this method is called to reset to the initial state as a way to process the next group. + +In the case of `State` for averaging (aka `avg`), for example, you would need the sum of the data, `sum`, and the number of entries in the data, `count`, and initialize both to 0 in the `reset()` method. 
+ +```java +class AvgState implements State { + double sum; + + long count; + + @Override + public void reset() { + sum = 0; + count = 0; + } + + // other methods +} +``` + +2. **byte[] serialize()/void deserialize(byte[] bytes)** + +These methods serialize the `State` into binary data, and deserialize the `State` from the binary data. IoTDB, as a distributed database, involves passing data among different nodes, so you need to write these two methods to enable the passing of the State among different nodes. Note that the order of serialization and deserialization must be the consistent. + +In the case of `State` for averaging (aka `avg`), for example, you can convert the content of State to `byte[]` array and read out the content of State from `byte[]` array in any way you want, the following shows the code for serialization/deserialization using `ByteBuffer` introduced by Java8: + +```java +@Override +public byte[] serialize() { + ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES + Long.BYTES); + buffer.putDouble(sum); + buffer.putLong(count); + + return buffer.array(); +} + +@Override +public void deserialize(byte[] bytes) { + ByteBuffer buffer = ByteBuffer.wrap(bytes); + sum = buffer.getDouble(); + count = buffer.getLong(); +} +``` + + + +#### UDAF Classes + +To write a UDAF, you need to implement the `org.apache.iotdb.udf.api.UDAF` interface. + +#### Interface Description: + +| Interface definition | Description | Required to Implement | +| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------- | +| void validate(UDFParameterValidator validator) throws Exception | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. 
| Optional | +| void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception | Initialization method that invokes user-defined initialization behavior before UDAF processes the input data. Unlike UDTF, configuration is of type `UDAFConfiguration`. | Required | +| State createState() | To create a `State` object, usually just call the default constructor and modify the default initial value as needed. | Required | +| void addInput(State state, Column[] columns, BitMap bitMap) | Update `State` object according to the incoming data `Column[]` in batch, note that last column `columns[columns.length - 1]` always represents the time column. In addition, `BitMap` represents the data that has been filtered out before, you need to manually determine whether the corresponding data has been filtered out when writing this method. | Required | +| void combineState(State state, State rhs) | Merge `rhs` state into `state` state. In a distributed scenario, the same set of data may be distributed on different nodes, IoTDB generates a `State` object for the partial data on each node, and then calls this method to merge it into the complete `State`. | Required | +| void outputFinal(State state, ResultValue resultValue) | Computes the final aggregated result based on the data in `State`. Note that according to the semantics of the aggregation, only one value can be output per group. | Required | +| void beforeDestroy() | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | + +In the life cycle of a UDAF instance, the calling sequence of each method is as follows: + +1. State createState() +2. void validate(UDFParameterValidator validator) throws Exception +3. void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception +4. void addInput(State state, Column[] columns, BitMap bitMap) +5. 
void combineState(State state, State rhs) +6. void outputFinal(State state, ResultValue resultValue) +7. void beforeDestroy() + +Similar to UDTF, every time the framework executes a UDAF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDAF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDAF without considering the influence of concurrency and other factors. + +#### Detailed interface introduction: + + +1. **void validate(UDFParameterValidator validator) throws Exception** + +Same as UDTF, the `validate` method is used to validate the parameters entered by the user. + +In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. + +2. **void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception** + + The `beforeStart` method does the same thing as the UDAF: + +1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. +2. Set the strategy to access the raw data and set the output data type in UDAFConfigurations. +3. Create resources, such as establishing external connections, opening files, etc. + +The role of the `UDFParameters` type can be seen above. + +2.2 **UDTFConfigurations** + +The difference from UDTF is that UDAF uses `UDAFConfigurations` as the type of `configuration` object. + +Currently, this class only supports setting the type of output data. + +```java +void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { + // parameters + // ... 
+ + // configurations + configurations + .setOutputDataType(Type.INT32); } +} +``` + +The relationship between the output type set in `setOutputDataType` and the type of data output that `ResultValue` can actually receive is as follows: + +| The output type set in `setOutputDataType` | The output type that `ResultValue` can actually receive | +| ------------------------------------------ | ------------------------------------------------------- | +| INT32 | int | +| INT64 | long | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | boolean | +| TEXT | org.apache.iotdb.udf.api.type.Binary | + +The output type of the UDAF is determined at runtime. You can dynamically determine the output sequence type based on the input type. + +Here is a simple example: + +```java +void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { + // do something + // ... + + configurations + .setOutputDataType(parameters.getDataType(0)); +} +``` + +3. **State createState()** + + +This method creates and initializes a `State` object for UDAF. Due to the limitations of the Java language, you can only call the default constructor for the `State` class. The default constructor assigns a default initial value to all the fields in the class, and if that initial value does not meet your requirements, you need to initialize them manually within this method. + +The following is an example that includes manual initialization. Suppose you want to implement an aggregate function that multiply all numbers in the group, then your initial `State` value should be set to 1, but the default constructor initializes it to 0, so you need to initialize `State` manually after calling the default constructor: + +```java +public State createState() { + MultiplyState state = new MultiplyState(); + state.result = 1; + return state; +} +``` + +4. **void addInput(State state, Column[] columns, BitMap bitMap)** + +This method updates the `State` object with the raw input data. 
For performance reasons, also to align with the IoTDB vectorized query engine, the raw input data is no longer a data point, but an array of columns ``Column[]``. Note that the last column (i.e. `columns[columns.length - 1]`) is always the time column, so you can also do different operations in UDAF depending on the time. + +Since the input parameter is not of a single data point type, but of multiple columns, you need to manually filter some of the data in the columns, which is why the third parameter, `BitMap`, exists. It identifies which of these columns have been filtered out, so you don't have to think about the filtered data in any case. + +Here's an example of `addInput()` that counts the number of items (aka count). It shows how you can use `BitMap` to ignore data that has been filtered out. Note that due to the limitations of the Java language, you need to do the explicit cast the `State` object from type defined in the interface to a custom `State` type at the beginning of the method, otherwise you won't be able to use the `State` object. + +```java +public void addInput(State state, Column[] columns, BitMap bitMap) { + CountState countState = (CountState) state; + + int count = columns[0].getPositionCount(); + for (int i = 0; i < count; i++) { + if (bitMap != null && !bitMap.isMarked(i)) { + continue; + } + if (!columns[0].isNull(i)) { + countState.count++; + } + } +} +``` + +5. **void combineState(State state, State rhs)** + + +This method combines two `State`s, or more precisely, updates the first `State` object with the second `State` object. IoTDB is a distributed database, and the data of the same group may be distributed on different nodes. For performance reasons, IoTDB will first aggregate some of the data on each node into `State`, and then merge the `State`s on different nodes that belong to the same group, which is what `combineState` does. + +Here's an example of `combineState()` for averaging (aka avg). 
Similar to `addInput`, you need to do an explicit type conversion for the two `State`s at the beginning. Also note that you are updating the value of the first `State` with the contents of the second `State`. + +```java +public void combineState(State state, State rhs) { + AvgState avgState = (AvgState) state; + AvgState avgRhs = (AvgState) rhs; + + avgState.count += avgRhs.count; + avgState.sum += avgRhs.sum; +} +``` + +6. **void outputFinal(State state, ResultValue resultValue)** + +This method works by calculating the final result from `State`. You need to access the various fields in `State`, derive the final result, and set the final result into the `ResultValue` object.IoTDB internally calls this method once at the end for each group. Note that according to the semantics of aggregation, the final result can only be one value. + +Here is another `outputFinal` example for averaging (aka avg). In addition to the forced type conversion at the beginning, you will also see a specific use of the `ResultValue` object, where the final result is set by `setXXX` (where `XXX` is the type name). + +```java +public void outputFinal(State state, ResultValue resultValue) { + AvgState avgState = (AvgState) state; + + if (avgState.count != 0) { + resultValue.setDouble(avgState.sum / avgState.count); + } else { + resultValue.setNull(); + } +} +``` + +7. **void beforeDestroy()** + + +The method for terminating a UDF. + +This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. + + +### Maven Project Example + +If you use Maven, you can build your own UDF project referring to our **udf-example** module. You can find the project [here](https://github.com/apache/iotdb/tree/master/example/udf). 
+ + +## Contribute universal built-in UDF functions to iotdb + +This part mainly introduces how external users can contribute their own UDFs to the IoTDB community. + +#### Prerequisites + +1. UDFs must be universal. + + The "universal" mentioned here refers to: UDFs can be widely used in some scenarios. In other words, the UDF function must have reuse value and may be directly used by other users in the community. + + If you are not sure whether the UDF you want to contribute is universal, you can send an email to `dev@iotdb.apache.org` or create an issue to initiate a discussion. + +2. The UDF you are going to contribute has been well tested and can run normally in the production environment. + + +#### What you need to prepare + +1. UDF source code +2. Test cases +3. Instructions + +#### UDF Source Code + +1. Create the UDF main class and related classes in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin` or in its subfolders. +2. Register your UDF in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`. + +#### Test Cases + +At a minimum, you need to write integration tests for the UDF. + +You can add a test class in `integration-test/src/test/java/org/apache/iotdb/db/it/udf`. + + +#### Instructions + +The instructions need to include: the name and the function of the UDF, the attribute parameters that must be provided when the UDF is executed, the applicable scenarios, and the usage examples, etc. + +The instructions for use should include both Chinese and English versions. Instructions for use should be added separately in `docs/zh/UserGuide/Operation Manual/DML Data Manipulation Language.md` and `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md`. + +#### Submit a PR + +When you have prepared the UDF source code, test cases, and instructions, you are ready to submit a Pull Request (PR) on [Github](https://github.com/apache/iotdb). 
You can refer to our code contribution guide to submit a PR: [Development Guide](https://iotdb.apache.org/Community/Development-Guide.html). + + +After the PR review is approved and merged, your UDF has already contributed to the IoTDB community! diff --git a/src/UserGuide/Master/User-Manual/Database-Programming.md b/src/UserGuide/Master/User-Manual/Database-Programming.md index 98f097c2d..9367c865e 100644 --- a/src/UserGuide/Master/User-Manual/Database-Programming.md +++ b/src/UserGuide/Master/User-Manual/Database-Programming.md @@ -1037,889 +1037,3 @@ SELECT avg(count_s1) from root.sg_count.d; | `continuous_query_submit_thread` | The number of threads in the scheduled thread pool that submit continuous query tasks periodically | int32 | 2 | | `continuous_query_min_every_interval_in_ms` | The minimum value of the continuous query execution time interval | duration | 1000 | -## USER-DEFINED FUNCTION (UDF) - -IoTDB provides a variety of built-in functions to meet your computing needs, and you can also create user defined functions to meet more computing needs. - -This document describes how to write, register and use a UDF. - - -### UDF Types - -In IoTDB, you can expand two types of UDF: - -| UDF Class | Description | -| --------------------------------------------------- | ------------------------------------------------------------ | -| UDTF(User Defined Timeseries Generating Function) | This type of function can take **multiple** time series as input, and output **one** time series, which can have any number of data points. | -| UDAF(User Defined Aggregation Function) | Custom Aggregation Functions. This type of function can take one time series as input, and output **one** aggregated data point for each group based on the GROUP BY type. | - -### UDF Development Dependencies - -If you use [Maven](http://search.maven.org/), you can search for the development dependencies listed below from the [Maven repository](http://search.maven.org/) . 
Please note that you must select the same dependency version as the target IoTDB server version for development. - -``` xml - - org.apache.iotdb - udf-api - 1.0.0 - provided - -``` - -### UDTF(User Defined Timeseries Generating Function) - -To write a UDTF, you need to inherit the `org.apache.iotdb.udf.api.UDTF` class, and at least implement the `beforeStart` method and a `transform` method. - -The following table shows all the interfaces available for user implementation. - -| Interface definition | Description | Required to Implement | -| :----------------------------------------------------------- | :----------------------------------------------------------- | ----------------------------------------------------- | -| `void validate(UDFParameterValidator validator) throws Exception` | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. | Optional | -| `void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception` | The initialization method to call the user-defined initialization behavior before a UDTF processes the input data. Every time a user executes a UDTF query, the framework will construct a new UDF instance, and `beforeStart` will be called. | Required | -| `void transform(Row row, PointCollector collector) throws Exception` | This method is called by the framework. This data processing method will be called when you choose to use the `RowByRowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `Row`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | -| `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | This method is called by the framework. 
This data processing method will be called when you choose to use the `SlidingSizeWindowAccessStrategy` or `SlidingTimeWindowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `RowWindow`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | -| `void terminate(PointCollector collector) throws Exception` | This method is called by the framework. This method will be called once after all `transform` calls have been executed. In a single UDF query, this method will and will only be called once. You need to call the data collection method provided by `collector` to determine the output data. | Optional | -| `void beforeDestroy() ` | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | - -In the life cycle of a UDTF instance, the calling sequence of each method is as follows: - -1. `void validate(UDFParameterValidator validator) throws Exception` -2. `void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception` -3. `void transform(Row row, PointCollector collector) throws Exception` or `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` -4. `void terminate(PointCollector collector) throws Exception` -5. `void beforeDestroy() ` - -Note that every time the framework executes a UDTF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDTF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDTF without considering the influence of concurrency and other factors. - -The usage of each interface will be described in detail below. 
- - - -#### void validate(UDFParameterValidator validator) throws Exception - -The `validate` method is used to validate the parameters entered by the user. - -In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. - -Please refer to the Javadoc for the usage of `UDFParameterValidator`. - - - -#### void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception - -This method is mainly used to customize UDTF. In this method, the user can do the following things: - -1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. -2. Set the strategy to access the raw data and set the output data type in UDTFConfigurations. -3. Create resources, such as establishing external connections, opening files, etc. - - - - -##### UDFParameters - -`UDFParameters` is used to parse UDF parameters in SQL statements (the part in parentheses after the UDF function name in SQL). The input parameters have two parts. The first part is data types of the time series that the UDF needs to process, and the second part is the key-value pair attributes for customization. Only the second part can be empty. - - -Example: - -``` sql -SELECT UDF(s1, s2, 'key1'='iotdb', 'key2'='123.45') FROM root.sg.d; -``` - -Usage: - -``` java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - String stringValue = parameters.getString("key1"); // iotdb - Float floatValue = parameters.getFloat("key2"); // 123.45 - Double doubleValue = parameters.getDouble("key3"); // null - int intValue = parameters.getIntOrDefault("key4", 678); // 678 - // do something - - // configurations - // ... -} -``` - - - -##### UDTFConfigurations - -You must use `UDTFConfigurations` to specify the strategy used by UDF to access raw data and the type of output sequence. 
- -Usage: - -``` java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - // parameters - // ... - - // configurations - configurations - .setAccessStrategy(new RowByRowAccessStrategy()) - .setOutputDataType(Type.INT32); -} -``` - -The `setAccessStrategy` method is used to set the UDF's strategy for accessing the raw data, and the `setOutputDataType` method is used to set the data type of the output sequence. - - - -###### setAccessStrategy - -Note that the raw data access strategy you set here determines which `transform` method the framework will call. Please implement the `transform` method corresponding to the raw data access strategy. Of course, you can also dynamically decide which strategy to set based on the attribute parameters parsed by `UDFParameters`. Therefore, two `transform` methods are also allowed to be implemented in one UDF. - -The following are the strategies you can set: - -| Interface definition | Description | The `transform` Method to Call | -| :-------------------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ | -| `RowByRowAccessStrategy` | Process raw data row by row. The framework calls the `transform` method once for each row of raw data input. When UDF has only one input sequence, a row of input is one data point in the input sequence. When UDF has multiple input sequences, one row of input is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(Row row, PointCollector collector) throws Exception` | -| `SlidingTimeWindowAccessStrategy` | Process a batch of data in a fixed time interval each time. We call the container of a data batch a window. The framework calls the `transform` method once for each raw data input window. 
There may be multiple rows of data in a window, and each row is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `SlidingSizeWindowAccessStrategy` | The raw data is processed batch by batch, and each batch contains a fixed number of raw data rows (except the last batch). We call the container of a data batch a window. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window, and each row is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `SessionTimeWindowAccessStrategy` | The raw data is processed batch by batch. We call the container of a data batch a window. The time interval between each two windows is greater than or equal to the `sessionGap` given by the user. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window, and each row is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `StateWindowAccessStrategy` | The raw data is processed batch by batch. We call the container of a data batch a window. 
In the state window, for text type or boolean type data, each value of the point in window is equal to the value of the first point in the window, and for numerical data, the distance between each value of the point in window and the value of the first point in the window is less than the threshold `delta` given by the user. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window. Currently, we only support state window for one measurement, that is, a column of data. | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | - - -`RowByRowAccessStrategy`: The construction of `RowByRowAccessStrategy` does not require any parameters. - -The `SlidingTimeWindowAccessStrategy` is shown schematically below. - - -`SlidingTimeWindowAccessStrategy`: `SlidingTimeWindowAccessStrategy` has many constructors, you can pass 3 types of parameters to them: - -- Parameter 1: The display window on the time axis -- Parameter 2: Time interval for dividing the time axis (should be positive) -- Parameter 3: Time sliding step (not required to be greater than or equal to the time interval, but must be a positive number) - -The first type of parameters are optional. If the parameters are not provided, the beginning time of the display window will be set to the same as the minimum timestamp of the query result set, and the ending time of the display window will be set to the same as the maximum timestamp of the query result set. - -The sliding step parameter is also optional. If the parameter is not provided, the sliding step will be set to the same as the time interval for dividing the time axis. - -The relationship between the three types of parameters can be seen in the figure below. Please see the Javadoc for more details. - -
- -Note that the actual time interval of some of the last time windows may be less than the specified time interval parameter. In addition, there may be cases where the number of data rows in some time windows is 0. In these cases, the framework will also call the `transform` method for the empty windows. - -The `SlidingSizeWindowAccessStrategy` is shown schematically below. - - -`SlidingSizeWindowAccessStrategy`: `SlidingSizeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: - -* Parameter 1: Window size. This parameter specifies the number of data rows contained in a data processing window. Note that the number of data rows in some of the last time windows may be less than the specified number of data rows. -* Parameter 2: Sliding step. This parameter means the number of rows between the first point of the next window and the first point of the current window. (This parameter is not required to be greater than or equal to the window size, but must be a positive number) - -The sliding step parameter is optional. If the parameter is not provided, the sliding step will be set to the same as the window size. - -The `SessionTimeWindowAccessStrategy` is shown schematically below. **Time intervals less than or equal to the given minimum time interval `sessionGap` are assigned in one group** - - -`SessionTimeWindowAccessStrategy`: `SessionTimeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: - -- Parameter 1: The display window on the time axis. -- Parameter 2: The minimum time interval `sessionGap` of two adjacent windows. - - -The `StateWindowAccessStrategy` is shown schematically below. **For numerical data, if the state difference is less than or equal to the given threshold `delta`, it will be assigned in one group. ** - - -`StateWindowAccessStrategy` has four constructors. 
- -- Constructor 1: For numerical data, there are 3 parameters: the time axis can display the start and end time of the time window and the threshold `delta` for the allowable change within a single window. -- Constructor 2: For text data and boolean data, there are 3 parameters: the time axis can be provided to display the start and end time of the time window. For both data types, the data within a single window is same, and there is no need to provide an allowable change threshold. -- Constructor 3: For numerical data, there are 1 parameters: you can only provide the threshold delta that is allowed to change within a single window. The start time of the time axis display time window will be defined as the smallest timestamp in the entire query result set, and the time axis display time window end time will be defined as The largest timestamp in the entire query result set. -- Constructor 4: For text data and boolean data, you can provide no parameter. The start and end timestamps are explained in Constructor 3. - -StateWindowAccessStrategy can only take one column as input for now. - -Please see the Javadoc for more details. - - - -###### setOutputDataType - -Note that the type of output sequence you set here determines the type of data that the `PointCollector` can actually receive in the `transform` method. 
The relationship between the output data type set in `setOutputDataType` and the actual data output type that `PointCollector` can receive is as follows: - -| Output Data Type Set in `setOutputDataType` | Data Type that `PointCollector` Can Receive | -| :------------------------------------------ | :----------------------------------------------------------- | -| `INT32` | `int` | -| `INT64` | `long` | -| `FLOAT` | `float` | -| `DOUBLE` | `double` | -| `BOOLEAN` | `boolean` | -| `TEXT` | `java.lang.String` and `org.apache.iotdb.udf.api.type.Binary` | - -The type of output time series of a UDTF is determined at runtime, which means that a UDTF can dynamically determine the type of output time series according to the type of input time series. -Here is a simple example: - -```java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - // do something - // ... - - configurations - .setAccessStrategy(new RowByRowAccessStrategy()) - .setOutputDataType(parameters.getDataType(0)); -} -``` - - - -#### void transform(Row row, PointCollector collector) throws Exception - -You need to implement this method when you specify the strategy of UDF to read the original data as `RowByRowAccessStrategy`. - -This method processes the raw data one row at a time. The raw data is input from `Row` and output by `PointCollector`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. - -The following is a complete UDF example that implements the `void transform(Row row, PointCollector collector) throws Exception` method. It is an adder that receives two columns of time series as input. When two data points in a row are not `null`, this UDF will output the algebraic sum of these two data points. 
- -``` java -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Adder implements UDTF { - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(TSDataType.INT64) - .setAccessStrategy(new RowByRowAccessStrategy()); - } - - @Override - public void transform(Row row, PointCollector collector) throws Exception { - if (row.isNull(0) || row.isNull(1)) { - return; - } - collector.putLong(row.getTime(), row.getLong(0) + row.getLong(1)); - } -} -``` - - - -#### void transform(RowWindow rowWindow, PointCollector collector) throws Exception - -You need to implement this method when you specify the strategy of UDF to read the original data as `SlidingTimeWindowAccessStrategy` or `SlidingSizeWindowAccessStrategy`. - -This method processes a batch of data in a fixed number of rows or a fixed time interval each time, and we call the container containing this batch of data a window. The raw data is input from `RowWindow` and output by `PointCollector`. `RowWindow` can help you access a batch of `Row`, it provides a set of interfaces for random access and iterative access to this batch of `Row`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. - -Below is a complete UDF example that implements the `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` method. 
It is a counter that receives any number of time series as input, and its function is to count and output the number of data rows in each time window within a specified time range. - -```java -import java.io.IOException; -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.access.RowWindow; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.SlidingTimeWindowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Counter implements UDTF { - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(TSDataType.INT32) - .setAccessStrategy(new SlidingTimeWindowAccessStrategy( - parameters.getLong("time_interval"), - parameters.getLong("sliding_step"), - parameters.getLong("display_window_begin"), - parameters.getLong("display_window_end"))); - } - - @Override - public void transform(RowWindow rowWindow, PointCollector collector) { - if (rowWindow.windowSize() != 0) { - collector.putInt(rowWindow.windowStartTime(), rowWindow.windowSize()); - } - } -} -``` - - - -#### void terminate(PointCollector collector) throws Exception - -In some scenarios, a UDF needs to traverse all the original data to calculate the final output data points. The `terminate` interface provides support for those scenarios. - -This method is called after all `transform` calls are executed and before the `beforeDestory` method is executed. You can implement the `transform` method to perform pure data processing (without outputting any data points), and implement the `terminate` method to output the processing results. - -The processing results need to be output by the `PointCollector`. 
You can output any number of data points in one `terminate` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. - -Below is a complete UDF example that implements the `void terminate(PointCollector collector) throws Exception` method. It takes one time series whose data type is `INT32` as input, and outputs the maximum value point of the series. - -```java -import java.io.IOException; -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Max implements UDTF { - - private Long time; - private int value; - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(TSDataType.INT32) - .setAccessStrategy(new RowByRowAccessStrategy()); - } - - @Override - public void transform(Row row, PointCollector collector) { - if (row.isNull(0)) { - return; - } - int candidateValue = row.getInt(0); - if (time == null || value < candidateValue) { - time = row.getTime(); - value = candidateValue; - } - } - - @Override - public void terminate(PointCollector collector) throws IOException { - if (time != null) { - collector.putInt(time, value); - } - } -} -``` - - - -#### void beforeDestroy() - -The method for terminating a UDF. - -This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. 
- - - -### UDAF (User Defined Aggregation Function) - -A complete definition of UDAF involves two classes, `State` and `UDAF`. - -#### State Class - -To write your own `State`, you need to implement the `org.apache.iotdb.udf.api.State` interface. - -The following table shows all the interfaces available for user implementation. - -| Interface Definition | Description | Required to Implement | -| -------------------------------- | ------------------------------------------------------------ | --------------------- | -| `void reset()` | To reset the `State` object to its initial state, you need to fill in the initial values of the fields in the `State` class within this method as if you were writing a constructor. | Required | -| `byte[] serialize()` | Serializes `State` to binary data. This method is used for IoTDB internal `State` passing. Note that the order of serialization must be consistent with the following deserialization methods. | Required | -| `void deserialize(byte[] bytes)` | Deserializes binary data to `State`. This method is used for IoTDB internal `State` passing. Note that the order of deserialization must be consistent with the serialization method above. | Required | - -The following section describes the usage of each interface in detail. - - - -##### void reset() - -This method resets the `State` to its initial state, you need to fill in the initial values of the fields in the `State` object in this method. For optimization reasons, IoTDB reuses `State` as much as possible internally, rather than creating a new `State` for each group, which would introduce unnecessary overhead. When `State` has finished updating the data in a group, this method is called to reset to the initial state as a way to process the next group. - -In the case of `State` for averaging (aka `avg`), for example, you would need the sum of the data, `sum`, and the number of entries in the data, `count`, and initialize both to 0 in the `reset()` method. 
- -```java -class AvgState implements State { - double sum; - - long count; - - @Override - public void reset() { - sum = 0; - count = 0; - } - - // other methods -} -``` - - - -##### byte[] serialize()/void deserialize(byte[] bytes) - -These methods serialize the `State` into binary data, and deserialize the `State` from the binary data. IoTDB, as a distributed database, involves passing data among different nodes, so you need to write these two methods to enable the passing of the State among different nodes. Note that the order of serialization and deserialization must be the consistent. - -In the case of `State` for averaging (aka `avg`), for example, you can convert the content of State to `byte[]` array and read out the content of State from `byte[]` array in any way you want, the following shows the code for serialization/deserialization using `ByteBuffer` introduced by Java8: - -```java -@Override -public byte[] serialize() { - ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES + Long.BYTES); - buffer.putDouble(sum); - buffer.putLong(count); - - return buffer.array(); -} - -@Override -public void deserialize(byte[] bytes) { - ByteBuffer buffer = ByteBuffer.wrap(bytes); - sum = buffer.getDouble(); - count = buffer.getLong(); -} -``` - - - -#### UDAF Classes - -To write a UDAF, you need to implement the `org.apache.iotdb.udf.api.UDAF` interface. - -The following table shows all the interfaces available for user implementation. - -| Interface definition | Description | Required to Implement | -| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------- | -| `void validate(UDFParameterValidator validator) throws Exception` | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. 
| Optional | -| `void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception` | Initialization method that invokes user-defined initialization behavior before UDAF processes the input data. Unlike UDTF, configuration is of type `UDAFConfiguration`. | Required | -| `State createState()` | To create a `State` object, usually just call the default constructor and modify the default initial value as needed. | Required | -| `void addInput(State state, Column[] columns, BitMap bitMap)` | Update `State` object according to the incoming data `Column[]` in batch, note that last column `columns[columns.length - 1]` always represents the time column. In addition, `BitMap` represents the data that has been filtered out before, you need to manually determine whether the corresponding data has been filtered out when writing this method. | Required | -| `void combineState(State state, State rhs)` | Merge `rhs` state into `state` state. In a distributed scenario, the same set of data may be distributed on different nodes, IoTDB generates a `State` object for the partial data on each node, and then calls this method to merge it into the complete `State`. | Required | -| `void outputFinal(State state, ResultValue resultValue)` | Computes the final aggregated result based on the data in `State`. Note that according to the semantics of the aggregation, only one value can be output per group. | Required | -| `void beforeDestroy() ` | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | - -In the life cycle of a UDAF instance, the calling sequence of each method is as follows: - -1. `State createState()` -2. `void validate(UDFParameterValidator validator) throws Exception` -3. `void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception` -4. `void addInput(State state, Column[] columns, BitMap bitMap)` -5. 
`void combineState(State state, State rhs)` -6. `void outputFinal(State state, ResultValue resultValue)` -7. `void beforeDestroy()` - -Similar to UDTF, every time the framework executes a UDAF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDAF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDAF without considering the influence of concurrency and other factors. - -The usage of each interface will be described in detail below. - - - -##### void validate(UDFParameterValidator validator) throws Exception - -Same as UDTF, the `validate` method is used to validate the parameters entered by the user. - -In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. - - - -##### void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception - - The `beforeStart` method does the same thing as the UDAF: - -1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. -2. Set the strategy to access the raw data and set the output data type in UDAFConfigurations. -3. Create resources, such as establishing external connections, opening files, etc. - -The role of the `UDFParameters` type can be seen above. - -###### UDAFConfigurations - -The difference from UDTF is that UDAF uses `UDAFConfigurations` as the type of `configuration` object. - -Currently, this class only supports setting the type of output data. - -```java -void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { - // parameters - // ... 
- - // configurations - configurations - .setOutputDataType(Type.INT32); } -} -``` - -The relationship between the output type set in `setOutputDataType` and the type of data output that `ResultValue` can actually receive is as follows: - -| The output type set in `setOutputDataType` | The output type that `ResultValue` can actually receive | -| ------------------------------------------ | ------------------------------------------------------- | -| `INT32` | `int` | -| `INT64` | `long` | -| `FLOAT` | `float` | -| `DOUBLE` | `double` | -| `BOOLEAN` | `boolean` | -| `TEXT` | `org.apache.iotdb.udf.api.type.Binary` | - -The output type of the UDAF is determined at runtime. You can dynamically determine the output sequence type based on the input type. - -Here is a simple example: - -```java -void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { - // do something - // ... - - configurations - .setOutputDataType(parameters.getDataType(0)); -} -``` - - - -##### State createState() - -This method creates and initializes a `State` object for UDAF. Due to the limitations of the Java language, you can only call the default constructor for the `State` class. The default constructor assigns a default initial value to all the fields in the class, and if that initial value does not meet your requirements, you need to initialize them manually within this method. - -The following is an example that includes manual initialization. 
Suppose you want to implement an aggregate function that multiply all numbers in the group, then your initial `State` value should be set to 1, but the default constructor initializes it to 0, so you need to initialize `State` manually after calling the default constructor: - -```java -public State createState() { - MultiplyState state = new MultiplyState(); - state.result = 1; - return state; -} -``` - - - -##### void addInput(State state, Column[] columns, BitMap bitMap) - -This method updates the `State` object with the raw input data. For performance reasons, also to align with the IoTDB vectorized query engine, the raw input data is no longer a data point, but an array of columns ``Column[]``. Note that the last column (i.e. `columns[columns.length - 1]`) is always the time column, so you can also do different operations in UDAF depending on the time. - -Since the input parameter is not of a single data point type, but of multiple columns, you need to manually filter some of the data in the columns, which is why the third parameter, `BitMap`, exists. It identifies which of these columns have been filtered out, so you don't have to think about the filtered data in any case. - -Here's an example of `addInput()` that counts the number of items (aka count). It shows how you can use `BitMap` to ignore data that has been filtered out. Note that due to the limitations of the Java language, you need to do the explicit cast the `State` object from type defined in the interface to a custom `State` type at the beginning of the method, otherwise you won't be able to use the `State` object. 
- -```java -public void addInput(State state, Column[] columns, BitMap bitMap) { - CountState countState = (CountState) state; - - int count = columns[0].getPositionCount(); - for (int i = 0; i < count; i++) { - if (bitMap != null && !bitMap.isMarked(i)) { - continue; - } - if (!columns[0].isNull(i)) { - countState.count++; - } - } -} -``` - - - -##### void combineState(State state, State rhs) - -This method combines two `State`s, or more precisely, updates the first `State` object with the second `State` object. IoTDB is a distributed database, and the data of the same group may be distributed on different nodes. For performance reasons, IoTDB will first aggregate some of the data on each node into `State`, and then merge the `State`s on different nodes that belong to the same group, which is what `combineState` does. - -Here's an example of `combineState()` for averaging (aka avg). Similar to `addInput`, you need to do an explicit type conversion for the two `State`s at the beginning. Also note that you are updating the value of the first `State` with the contents of the second `State`. - -```java -public void combineState(State state, State rhs) { - AvgState avgState = (AvgState) state; - AvgState avgRhs = (AvgState) rhs; - - avgState.count += avgRhs.count; - avgState.sum += avgRhs.sum; -} -``` - - - -##### void outputFinal(State state, ResultValue resultValue) - -This method works by calculating the final result from `State`. You need to access the various fields in `State`, derive the final result, and set the final result into the `ResultValue` object.IoTDB internally calls this method once at the end for each group. Note that according to the semantics of aggregation, the final result can only be one value. - -Here is another `outputFinal` example for averaging (aka avg). 
In addition to the forced type conversion at the beginning, you will also see a specific use of the `ResultValue` object, where the final result is set by `setXXX` (where `XXX` is the type name). - -```java -public void outputFinal(State state, ResultValue resultValue) { - AvgState avgState = (AvgState) state; - - if (avgState.count != 0) { - resultValue.setDouble(avgState.sum / avgState.count); - } else { - resultValue.setNull(); - } -} -``` - - - -##### void beforeDestroy() - -The method for terminating a UDF. - -This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. - - - -### Maven Project Example - -If you use Maven, you can build your own UDF project referring to our **udf-example** module. You can find the project [here](https://github.com/apache/iotdb/tree/master/example/udf). - - - -### UDF Registration - -The process of registering a UDF in IoTDB is as follows: - -1. Implement a complete UDF class, assuming the full class name of this class is `org.apache.iotdb.udf.ExampleUDTF`. -2. Package your project into a JAR. If you use Maven to manage your project, you can refer to the Maven project example above. -3. Make preparations for registration according to the registration mode. For details, see the following example. -4. You can use following SQL to register UDF. - -```sql -CREATE FUNCTION AS (USING URI URI-STRING)? -``` - -#### Example: register UDF named `example`, you can choose either of the following two registration methods - -##### No URI - -Prepare: -When use this method to register,you should put JAR to directory `iotdb-server-1.0.0-all-bin/ext/udf`(directory can config). 
-**Note: you should put the JAR into this directory on all DataNodes if using a Cluster.** - -SQL: - -```sql -CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' -``` - -##### Using URI - -Prepare: -When using this method to register, you need to upload the JAR to a URI server and ensure that the IoTDB instance executing this registration statement has access to the URI server. -**Note: you don't need to place the JAR manually; IoTDB will download the JAR and sync it.** - -SQL: - -```sql -CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' -``` - -#### Note - -Since UDF instances are dynamically loaded through reflection technology, you do not need to restart the server during the UDF registration process. - -UDF function names are not case-sensitive. - -Please ensure that the function name given to the UDF is different from all built-in function names. A UDF with the same name as a built-in function cannot be registered. - -We recommend that you do not use classes that have the same class name but different function logic in different JAR packages. For example, in `UDF(UDAF/UDTF): udf1, udf2`, the JAR package of udf1 is `udf1.jar` and the JAR package of udf2 is `udf2.jar`. Assume that both JAR packages contain the `org.apache.iotdb.udf.ExampleUDTF` class. If you use two UDFs in the same SQL statement at the same time, the system will randomly load either of them and may cause inconsistency in UDF execution behavior. - -### UDF Deregistration - -The following shows the SQL syntax of how to deregister a UDF. - -```sql -DROP FUNCTION <UDF-NAME> -``` - -Here is an example: - -```sql -DROP FUNCTION example -``` - - - -### UDF Queries - -The usage of UDF is similar to that of built-in aggregation functions.
- - - -#### Basic SQL syntax support - -* Support `SLIMIT` / `SOFFSET` -* Support `LIMIT` / `OFFSET` -* Support queries with time filters -* Support queries with value filters - - -#### Queries with * in SELECT Clauses - -Assume that there are 2 time series (`root.sg.d1.s1` and `root.sg.d1.s2`) in the system. - -* **`SELECT example(*) from root.sg.d1`** - -Then the result set will include the results of `example (root.sg.d1.s1)` and `example (root.sg.d1.s2)`. - -* **`SELECT example(s1, *) from root.sg.d1`** - -Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)` and `example(root.sg.d1.s1, root.sg.d1.s2)`. - -* **`SELECT example(*, *) from root.sg.d1`** - -Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)`, `example(root.sg.d1.s2, root.sg.d1.s1)`, `example(root.sg.d1.s1, root.sg.d1.s2)` and `example(root.sg.d1.s2, root.sg.d1.s2)`. - - - -#### Queries with Key-value Attributes in UDF Parameters - -You can pass any number of key-value pair parameters to the UDF when constructing a UDF query. The key and value in the key-value pair need to be enclosed in single or double quotes. Note that key-value pair parameters can only be passed in after all time series have been passed in. 
Here is a set of examples: - -``` sql -SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; -SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; -``` - - - -#### Nested Queries - -``` sql -SELECT s1, s2, example(s1, s2) FROM root.sg.d1; -SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; -SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; -SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; -``` - - - -### Show All Registered UDFs - -``` sql -SHOW FUNCTIONS -``` - - - -### User Permission Management - -There are 1 types of user permissions related to UDF: `USE_UDF` - -* Only users with this permission are allowed to register UDFs -* Only users with this permission are allowed to deregister UDFs -* Only users with this permission are allowed to use UDFs for queries - -For more user permissions related content, please refer to [Account Management Statements](./Authority-Management.md). - - - -### Configurable Properties - -You can use `udf_lib_dir` to config udf lib directory. -When querying by a UDF, IoTDB may prompt that there is insufficient memory. You can resolve the issue by configuring `udf_initial_byte_array_length_for_memory_control`, `udf_memory_budget_in_mb` and `udf_reader_transformer_collector_memory_proportion` in `iotdb-system.properties` and restarting the server. - - - -### Contribute UDF - - - -This part mainly introduces how external users can contribute their own UDFs to the IoTDB community. - - - -#### Prerequisites - -1. UDFs must be universal. - - The "universal" mentioned here refers to: UDFs can be widely used in some scenarios. In other words, the UDF function must have reuse value and may be directly used by other users in the community. - - If you are not sure whether the UDF you want to contribute is universal, you can send an email to `dev@iotdb.apache.org` or create an issue to initiate a discussion. - -2. 
The UDF you are going to contribute has been well tested and can run normally in the production environment. - - - -#### What you need to prepare - -1. UDF source code -2. Test cases -3. Instructions - - - -##### UDF Source Code - -1. Create the UDF main class and related classes in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin` or in its subfolders. -2. Register your UDF in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`. - - - -##### Test Cases - -At a minimum, you need to write integration tests for the UDF. - -You can add a test class in `integration-test/src/test/java/org/apache/iotdb/db/it/udf`. - - - -##### Instructions - -The instructions need to include: the name and the function of the UDF, the attribute parameters that must be provided when the UDF is executed, the applicable scenarios, and the usage examples, etc. - -The instructions should be added in `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md`. - - - -#### Submit a PR - -When you have prepared the UDF source code, test cases, and instructions, you are ready to submit a Pull Request (PR) on [Github](https://github.com/apache/iotdb). You can refer to our code contribution guide to submit a PR: [Development Guide](https://iotdb.apache.org/Community/Development-Guide.html). - -### Known Implementations - -#### Built-in UDF - -1. Aggregate Functions, such as `SUM`. For details and examples, see the document [Aggregate Functions](../Reference/Function-and-Expression.md#aggregate-functions). -2. Arithmetic Functions, such as `SIN`. For details and examples, see the document [Arithmetic Operators and Functions](../Reference/Function-and-Expression.md#arithmetic-operators-and-functions). -3. Comparison Functions, such as `ON_OFF`. For details and examples, see the document [Comparison Operators and Functions](../Reference/Function-and-Expression.md#comparison-operators-and-functions). -4. 
String Processing Functions, such as `STRING_CONTAINS`. For details and examples, see the document [String Processing](../Reference/Function-and-Expression.md#string-processing). -5. Data Type Conversion Function, such as `CAST`. For details and examples, see the document [Data Type Conversion Function](../Reference/Function-and-Expression.md#data-type-conversion-function). -6. Constant Timeseries Generating Functions, such as `CONST`. For details and examples, see the document [Constant Timeseries Generating Functions](../Reference/Function-and-Expression.md#constant-timeseries-generating-functions). -7. Selector Functions, such as `TOP_K`. For details and examples, see the document [Selector Functions](../Reference/Function-and-Expression.md#selector-functions). -8. Continuous Interval Functions, such as `ZERO_DURATION`. For details and examples, see the document [Continuous Interval Functions](../Reference/Function-and-Expression.md#continuous-interval-functions). -9. Variation Trend Calculation Functions, such as `TIME_DIFFERENCE`. For details and examples, see the document [Variation Trend Calculation Functions](../Reference/Function-and-Expression.md#variation-trend-calculation-functions). -10. Sample Functions, such as `M4`. For details and examples, see the document [Sample Functions](../Reference/Function-and-Expression.md#sample-functions). -11. Change Points Function, such as `CHANGE_POINTS`. For details and examples, see the document [Time-Series](../Reference/Function-and-Expression.md#time-series-processing). - -#### Data Quality Function Library - -##### About - -For applications based on time series data, data quality is vital. The **UDF Library** is a collection of IoTDB User Defined Functions (UDF) for data quality, including data profiling, data quality evaluation and data repairing. It effectively meets the demand for data quality in the industrial field.
- -##### Quick Start - -The functions in this function library are not built-in functions, and must be loaded into the system before use. - -1. [Download](https://archive.apache.org/dist/iotdb/1.0.1/apache-iotdb-1.0.1-library-udf-bin.zip) the JAR with all dependencies and the script of registering UDF. -2. Copy the JAR package to `ext\udf` under the directory of IoTDB system (Please put JAR to this directory of all DataNodes if you use Cluster). -3. Run `sbin\start-server.bat` (for Windows) or `sbin\start-server.sh` (for Linux or MacOS) to start IoTDB server. -4. Copy the script to the directory of IoTDB system (under the root directory, at the same level as `sbin`), modify the parameters in the script if needed and run it to register UDF. - -##### Implemented Functions - -1. Data Quality related functions, such as `Completeness`. For details and examples, see the document [Data-Quality](../Reference/UDF-Libraries.md#data-quality). -2. Data Profiling related functions, such as `ACF`. For details and examples, see the document [Data-Profiling](../Reference/UDF-Libraries.md#data-profiling). -3. Anomaly Detection related functions, such as `IQR`. For details and examples, see the document [Anomaly-Detection](../Reference/UDF-Libraries.md#anomaly-detection). -4. Frequency Domain Analysis related functions, such as `Conv`. For details and examples, see the document [Frequency-Domain](../Reference/UDF-Libraries.md#frequency-domain-analysis). -5. Data Matching related functions, such as `DTW`. For details and examples, see the document [Data-Matching](../Reference/UDF-Libraries.md#data-matching). -6. Data Repairing related functions, such as `TimestampRepair`. For details and examples, see the document [Data-Repairing](../Reference/UDF-Libraries.md#data-repairing). -7. Series Discovery related functions, such as `ConsecutiveSequences`. For details and examples, see the document [Series-Discovery](../Reference/UDF-Libraries.md#series-discovery). -8. 
Machine Learning related functions, such as `AR`. For details and examples, see the document [Machine-Learning](../Reference/UDF-Libraries.md#machine-learning). - - -### Q&A - -Q1: How to modify the registered UDF? - -A1: Assume that the name of the UDF is `example` and the full class name is `org.apache.iotdb.udf.ExampleUDTF`, which is introduced by `example.jar`. - -1. Unload the registered function by executing `DROP FUNCTION example`. -2. Delete `example.jar` under `iotdb-server-1.0.0-all-bin/ext/udf`. -3. Modify the logic in `org.apache.iotdb.udf.ExampleUDTF` and repackage it. The name of the JAR package can still be `example.jar`. -4. Upload the new JAR package to `iotdb-server-1.0.0-all-bin/ext/udf`. -5. Load the new UDF by executing `CREATE FUNCTION example AS "org.apache.iotdb.udf.ExampleUDTF"`. diff --git a/src/UserGuide/Master/User-Manual/User-defined-function.md b/src/UserGuide/Master/User-Manual/User-defined-function.md new file mode 100644 index 000000000..6ed0687a4 --- /dev/null +++ b/src/UserGuide/Master/User-Manual/User-defined-function.md @@ -0,0 +1,213 @@ +# USER-DEFINED FUNCTION (UDF) + +## 1. UDF Introduction + +UDF (User Defined Function) refers to user-defined functions. IoTDB provides a variety of built-in time series processing functions and also supports extending custom functions to meet more computing needs. + +In IoTDB, you can expand two types of UDF: + + + + + + + + + + + + + + + + + + + + + +
UDF ClassAccessStrategyDescription
UDTFMAPPABLE_ROW_BY_ROWCustom scalar function, input k columns of time series and 1 row of data, output 1 column of time series and 1 row of data, can be used in any clause and expression that appears in the scalar function, such as select clause, where clause, etc.
ROW_BY_ROW
SLIDING_TIME_WINDOW
SLIDING_SIZE_WINDOW
SESSION_TIME_WINDOW
STATE_WINDOW
Custom time series generation function, input k columns of time series m rows of data, output 1 column of time series n rows of data, the number of input rows m can be different from the number of output rows n, and can only be used in SELECT clauses.
UDAF-Custom aggregation function, input k columns of time series m rows of data, output 1 column of time series 1 row of data, can be used in any clause and expression that appears in the aggregation function, such as select clause, having clause, etc.
+ +### 1.1 UDF usage + +The usage of UDF is similar to that of regular built-in functions, and can be directly used in SELECT statements like calling regular functions. + +#### 1.Basic SQL syntax support + +* Support `SLIMIT` / `SOFFSET` +* Support `LIMIT` / `OFFSET` +* Support queries with value filters +* Support queries with time filters + + +#### 2. Queries with * in SELECT Clauses + +Assume that there are 2 time series (`root.sg.d1.s1` and `root.sg.d1.s2`) in the system. + +* **`SELECT example(*) from root.sg.d1`** + +Then the result set will include the results of `example (root.sg.d1.s1)` and `example (root.sg.d1.s2)`. + +* **`SELECT example(s1, *) from root.sg.d1`** + +Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)` and `example(root.sg.d1.s1, root.sg.d1.s2)`. + +* **`SELECT example(*, *) from root.sg.d1`** + +Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)`, `example(root.sg.d1.s2, root.sg.d1.s1)`, `example(root.sg.d1.s1, root.sg.d1.s2)` and `example(root.sg.d1.s2, root.sg.d1.s2)`. + +#### 3. Queries with Key-value Attributes in UDF Parameters + +You can pass any number of key-value pair parameters to the UDF when constructing a UDF query. The key and value in the key-value pair need to be enclosed in single or double quotes. Note that key-value pair parameters can only be passed in after all time series have been passed in. Here is a set of examples: + + Example: +``` sql +SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; +SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; +``` + +#### 4. Nested Queries + + Example: +``` sql +SELECT s1, s2, example(s1, s2) FROM root.sg.d1; +SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; +SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; +SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; +``` + +## 2. 
UDF Development + +You can refer to the UDF development guide: [Development Guide](../Reference/UDF-development.md) + +## 3. UDF management + +### 3.1 UDF Registration + +The process of registering a UDF in IoTDB is as follows: + +1. Implement a complete UDF class, assuming the full class name of this class is `org.apache.iotdb.udf.ExampleUDTF`. +2. Convert the project into a JAR package. If using Maven to manage the project, you can refer to the [Maven project example](https://github.com/apache/iotdb/tree/master/example/udf). +3. Make preparations for registration according to the registration mode. For details, see the following example. +4. You can use the following SQL to register the UDF. + +```sql +CREATE FUNCTION <UDF-NAME> AS <CLASS-NAME> (USING URI <URI-STRING>)? +``` + +#### Example: register UDF named `example`, you can choose either of the following two registration methods + +#### Method 1: Manually place the jar package + +Prepare: +When registering using this method, it is necessary to place the JAR package in advance in the `ext/udf` directory (configurable) of all DataNodes in the cluster. + +Registration statement: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' +``` + +#### Method 2: Cluster automatically installs jar packages through URI + +Prepare: +When registering using this method, it is necessary to upload the JAR package to the URI server in advance and ensure that the IoTDB instance executing the registration statement can access the URI server. + +Registration statement: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' +``` + +IoTDB will download JAR packages and synchronize them to the entire cluster. + +#### Note + +1. Since UDF instances are dynamically loaded through reflection technology, you do not need to restart the server during the UDF registration process. + +2. UDF function names are not case-sensitive. + +3. 
Please ensure that the function name given to the UDF is different from all built-in function names. A UDF with the same name as a built-in function cannot be registered. + +4. We recommend that you do not use classes that have the same class name but different function logic in different JAR packages. For example, in `UDF(UDAF/UDTF): udf1, udf2`, the JAR package of udf1 is `udf1.jar` and the JAR package of udf2 is `udf2.jar`. Assume that both JAR packages contain the `org.apache.iotdb.udf.ExampleUDTF` class. If you use two UDFs in the same SQL statement at the same time, the system will randomly load either of them and may cause inconsistency in UDF execution behavior. + +### 3.2 UDF Deregistration + +The SQL syntax is as follows: + +```sql +DROP FUNCTION <UDF-NAME> +``` + +Example: Uninstall the UDF from the above example: + +```sql +DROP FUNCTION example +``` + + + +### 3.3 Show All Registered UDFs + +``` sql +SHOW FUNCTIONS +``` + +### 3.4 UDF configuration + +- UDF configuration allows configuring the storage directory of UDF in `iotdb-system.properties` + ``` Properties +# UDF lib dir + +udf_lib_dir=ext/udf +``` + +- When using custom functions, there is a message indicating insufficient memory. Change the following configuration parameters in `iotdb-system.properties` and restart the service. + + ``` Properties + +# Used to estimate the memory usage of text fields in a UDF query. +# It is recommended to set this value to be slightly larger than the average length of all text +# effectiveMode: restart +# Datatype: int +udf_initial_byte_array_length_for_memory_control=48 + +# How much memory may be used in ONE UDF query (in MB). +# The upper limit is 20% of allocated memory for read. +# effectiveMode: restart +# Datatype: float +udf_memory_budget_in_mb=30.0 + +# UDF memory allocation ratio. +# The parameter form is a:b:c, where a, b, and c are integers. 
+# effectiveMode: restart +udf_reader_transformer_collector_memory_proportion=1:1:1 +``` + +### 3.5 UDF User Permissions + + +When using UDFs, the `USE_UDF` permission is involved: only users with this permission are allowed to perform UDF registration, uninstallation, and query operations. + +For more user permissions related content, please refer to [Account Management Statements](./Authority-Management.md). + + +## 4. UDF Libraries + +Based on the ability of user-defined functions, IoTDB provides a series of functions for temporal data processing, including data quality, data profiling, anomaly detection, frequency domain analysis, data matching, data repairing, sequence discovery, machine learning, etc., which can meet the needs of industrial fields for temporal data processing. + +You can refer to the [UDF Libraries](../Reference/UDF-Libraries.md) document to find the installation steps and registration statements for each function, to ensure that all required functions are registered correctly. + + +## 5. Frequently Asked Questions + +Q1: How to modify the registered UDF? + +A1: Assume that the name of the UDF is `example` and the full class name is `org.apache.iotdb.udf.ExampleUDTF`, which is introduced by `example.jar`. + +1. Unload the registered function by executing `DROP FUNCTION example`. +2. Delete `example.jar` under `iotdb-server-1.0.0-all-bin/ext/udf`. +3. Modify the logic in `org.apache.iotdb.udf.ExampleUDTF` and repackage it. The name of the JAR package can still be `example.jar`. +4. Upload the new JAR package to `iotdb-server-1.0.0-all-bin/ext/udf`. +5. Load the new UDF by executing `CREATE FUNCTION example AS "org.apache.iotdb.udf.ExampleUDTF"`. 
+ diff --git a/src/UserGuide/latest/Reference/UDF-Libraries.md b/src/UserGuide/latest/Reference/UDF-Libraries.md index 0295dfe7b..39cc2a743 100644 --- a/src/UserGuide/latest/Reference/UDF-Libraries.md +++ b/src/UserGuide/latest/Reference/UDF-Libraries.md @@ -21,10 +21,38 @@ # UDF Libraries +Based on the ability of user-defined functions, IoTDB provides a series of functions for temporal data processing, including data quality, data profiling, anomaly detection, frequency domain analysis, data matching, data repairing, sequence discovery, machine learning, etc., which can meet the needs of industrial fields for temporal data processing. + +## Installation steps + +1. Please obtain the compressed file of the UDF library JAR package that is compatible with the IoTDB version. + + | UDF libraries version | Supported IoTDB versions | Download link | + | --------------- | ----------------- | ------------------------------------------------------------ | + | UDF-1.3.3.zip | V1.3.3 and above | [UDF.zip](https://alioss.timecho.com/upload/UDF-1.3.3.zip) | + | UDF-1.3.2.zip | V1.0.0~V1.3.2 | [UDF.zip](https://alioss.timecho.com/upload/UDF-1.3.2.zip) | + +2. Place the library-udf.jar file from the obtained compressed package in the path of IoTDB at `/iotdb-enterprise-x.x.x.x-bin/ext/udf` +3. In the SQL command line terminal (CLI) or visualization console (Workbench) SQL operation interface of IoTDB, execute the corresponding function registration statement as follows. +4. 
Batch registration: Two registration methods: registration script or SQL full statement +- Register Script + - Copy the registration script (register-UDF.sh or register-UDF.bat) from the compressed package to the `tools` directory of IoTDB as needed, and modify the parameters in the script (default is host=127.0.0.1, rpcPort=6667, user=root, pass=root); + - Start IoTDB service, run registration script to batch register UDF + +- All SQL statements + - Open the SQl file in the compressed package, copy all SQL statements, and execute all SQl statements in the SQL command line terminal (CLI) of IoTDB or the SQL operation interface of the visualization console (Workbench) to batch register UDF + + ## Data Quality ### Completeness +#### Registration statement + +```sql +create function completeness as 'org.apache.iotdb.library.dquality.UDTFCompleteness' +``` + #### Usage This function is used to calculate the completeness of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the completeness of each window will be output. @@ -150,6 +178,12 @@ Output series: ### Consistency +#### Registration statement + +```sql +create function consistency as 'org.apache.iotdb.library.dquality.UDTFConsistency' +``` + #### Usage This function is used to calculate the consistency of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the consistency of each window will be output. @@ -274,6 +308,12 @@ Output series: ### Timeliness +#### Registration statement + +```sql +create function timeliness as 'org.apache.iotdb.library.dquality.UDTFTimeliness' +``` + #### Usage This function is used to calculate the timeliness of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the timeliness of each window will be output. 
@@ -398,6 +438,12 @@ Output series: ### Validity +#### Registration statement + +```sql +create function validity as 'org.apache.iotdb.library.dquality.UDTFValidity' +``` + #### Usage This function is used to calculate the Validity of time series. The input series are divided into several continuous and non overlapping windows. The timestamp of the first data point and the Validity of each window will be output. @@ -547,6 +593,12 @@ Output series: ### ACF +#### Registration statement + +```sql +create function acf as 'org.apache.iotdb.library.dprofile.UDTFACF' +``` + #### Usage This function is used to calculate the auto-correlation factor of the input time series, @@ -606,6 +658,12 @@ Output series: ### Distinct +#### Registration statement + +```sql +create function distinct as 'org.apache.iotdb.library.dprofile.UDTFDistinct' +``` + #### Usage This function returns all unique values in time series. @@ -659,6 +717,12 @@ Output series: ### Histogram +#### Registration statement + +```sql +create function histogram as 'org.apache.iotdb.library.dprofile.UDTFHistogram' +``` + #### Usage This function is used to calculate the distribution histogram of a single column of numerical data. @@ -738,6 +802,12 @@ Output series: ### Integral +#### Registration statement + +```sql +create function integral as 'org.apache.iotdb.library.dprofile.UDAFIntegral' +``` + #### Usage This function is used to calculate the integration of time series, @@ -829,6 +899,12 @@ $$\frac{1}{2\times 60}[(1+2) \times 1 + (2+5) \times 1 + (5+6) \times 1 + (6+7) ### IntegralAvg +#### Registration statement + +```sql +create function integralavg as 'org.apache.iotdb.library.dprofile.UDAFIntegralAvg' +``` + #### Usage This function is used to calculate the function average of time series. 
@@ -890,6 +966,12 @@ $$\frac{1}{2}[(1+2) \times 1 + (2+5) \times 1 + (5+6) \times 1 + (6+7) \times 1 ### Mad +#### Registration statement + +```sql +create function mad as 'org.apache.iotdb.library.dprofile.UDAFMad' +``` + #### Usage The function is used to compute the exact or approximate median absolute deviation (MAD) of a numeric time series. MAD is the median of the deviation of each element from the elements' median. @@ -988,6 +1070,12 @@ Output series: ### Median +#### Registration statement + +```sql +create function median as 'org.apache.iotdb.library.dprofile.UDAFMedian' +``` + #### Usage The function is used to compute the exact or approximate median of a numeric time series. Median is the value separating the higher half from the lower half of a data sample. @@ -1058,6 +1146,12 @@ Output series: ### MinMax +#### Registration statement + +```sql +create function minmax as 'org.apache.iotdb.library.dprofile.UDTFMinMax' +``` + #### Usage This function is used to standardize the input series with min-max. Minimum value is transformed to 0; maximum value is transformed to 1. @@ -1197,6 +1291,12 @@ Output series: ### MvAvg +#### Registration statement + +```sql +create function mvavg as 'org.apache.iotdb.library.dprofile.UDTFMvAvg' +``` + #### Usage This function is used to calculate moving average of input series. @@ -1277,6 +1377,12 @@ Output series: ### PACF +#### Registration statement + +```sql +create function pacf as 'org.apache.iotdb.library.dprofile.UDTFPACF' +``` + #### Usage This function is used to calculate partial autocorrelation of input series by solving Yule-Walker equation. For some cases, the equation may not be solved, and NaN will be output. @@ -1346,6 +1452,12 @@ Output series: ### Percentile +#### Registration statement + +```sql +create function percentile as 'org.apache.iotdb.library.dprofile.UDAFPercentile' +``` + #### Usage The function is used to compute the exact or approximate percentile of a numeric time series. 
A percentile is value of element in the certain rank of the sorted series. @@ -1419,6 +1531,12 @@ Output series: ### Quantile +#### Registration statement + +```sql +create function quantile as 'org.apache.iotdb.library.dprofile.UDAFQuantile' +``` + #### Usage The function is used to compute the approximate quantile of a numeric time series. A quantile is value of element in the certain rank of the sorted series. @@ -1492,6 +1610,12 @@ Output series: ### Period +#### Registration statement + +```sql +create function period as 'org.apache.iotdb.library.dprofile.UDAFPeriod' +``` + #### Usage The function is used to compute the period of a numeric time series. @@ -1541,6 +1665,12 @@ Output series: ### QLB +#### Registration statement + +```sql +create function qlb as 'org.apache.iotdb.library.dprofile.UDTFQLB' +``` + #### Usage This function is used to calculate Ljung-Box statistics $Q_{LB}$ for time series, and convert it to p value. @@ -1625,6 +1755,12 @@ Output series: ### Resample +#### Registration statement + +```sql +create function re_sample as 'org.apache.iotdb.library.dprofile.UDTFResample' +``` + #### Usage This function is used to resample the input series according to a given frequency, @@ -1754,6 +1890,12 @@ Output series: ### Sample +#### Registration statement + +```sql +create function sample as 'org.apache.iotdb.library.dprofile.UDTFSample' +``` + #### Usage This function is used to sample the input series, @@ -1852,6 +1994,12 @@ Output series: ### Segment +#### Registration statement + +```sql +create function segment as 'org.apache.iotdb.library.dprofile.UDTFSegment' +``` + #### Usage This function is used to segment a time series into subsequences according to linear trend, and returns linear fitted values of first values in each subsequence or every data point. 
@@ -1944,6 +2092,12 @@ Output series: ### Skew +#### Registration statement + +```sql +create function skew as 'org.apache.iotdb.library.dprofile.UDAFSkew' +``` + #### Usage This function is used to calculate the population skewness. @@ -2005,6 +2159,12 @@ Output series: ### Spline +#### Registration statement + +```sql +create function spline as 'org.apache.iotdb.library.dprofile.UDTFSpline' +``` + #### Usage This function is used to calculate cubic spline interpolation of input series. @@ -2210,6 +2370,12 @@ Output series: ### Spread +#### Registration statement + +```sql +create function spread as 'org.apache.iotdb.library.dprofile.UDAFSpread' +``` + #### Usage This function is used to calculate the spread of time series, that is, the maximum value minus the minimum value. @@ -2327,6 +2493,12 @@ Output series: ### ZScore +#### Registration statement + +```sql +create function zscore as 'org.apache.iotdb.library.dprofile.UDTFZScore' +``` + #### Usage This function is used to standardize the input series with z-score. @@ -2433,6 +2605,12 @@ Output series: ### IQR +#### Registration statement + +```sql +create function iqr as 'org.apache.iotdb.library.anomaly.UDTFIQR' +``` + #### Usage This function is used to detect anomalies based on IQR. Points distributing beyond 1.5 times IQR are selected. @@ -2500,6 +2678,12 @@ Output series: ### KSigma +#### Registration statement + +```sql +create function ksigma as 'org.apache.iotdb.library.anomaly.UDTFKSigma' +``` + #### Usage This function is used to detect anomalies based on the Dynamic K-Sigma Algorithm. @@ -2565,6 +2749,12 @@ Output series: ### LOF +#### Registration statement + +```sql +create function LOF as 'org.apache.iotdb.library.anomaly.UDTFLOF' +``` + #### Usage This function is used to detect density anomaly of time series. 
According to k-th distance calculation parameter and local outlier factor (lof) threshold, the function judges if a set of input values is an density anomaly, and a bool mark of anomaly values will be output. @@ -2691,6 +2881,12 @@ Output series: ### MissDetect +#### Registration statement + +```sql +create function missdetect as 'org.apache.iotdb.library.anomaly.UDTFMissDetect' +``` + #### Usage This function is used to detect missing anomalies. @@ -2779,6 +2975,12 @@ Output series: ### Range +#### Registration statement + +```sql +create function range as 'org.apache.iotdb.library.anomaly.UDTFRange' +``` + #### Usage This function is used to detect range anomaly of time series. According to upper bound and lower bound parameters, the function judges if a input value is beyond range, aka range anomaly, and a new time series of anomaly will be output. @@ -2844,6 +3046,12 @@ Output series: ### TwoSidedFilter +#### Registration statement + +```sql +create function twosidedfilter as 'org.apache.iotdb.library.anomaly.UDTFTwoSidedFilter' +``` + #### Usage The function is used to filter anomalies of a numeric time series based on two-sided window detection. @@ -2937,6 +3145,12 @@ Output series: ### Outlier +#### Registration statement + +```sql +create function outlier as 'org.apache.iotdb.library.anomaly.UDTFOutlier' +``` + #### Usage This function is used to detect distance-based outliers. For each point in the current window, if the number of its neighbors within the distance of neighbor distance threshold is less than the neighbor count threshold, the point in detected as an outlier. @@ -3260,6 +3474,12 @@ Output series: ### Conv +#### Registration statement + +```sql +create function conv as 'org.apache.iotdb.library.frequency.UDTFConv' +``` + #### Usage This function is used to calculate the convolution, i.e. polynomial multiplication. 
@@ -3307,6 +3527,12 @@ Output series: ### Deconv +#### Registration statement + +```sql +create function deconv as 'org.apache.iotdb.library.frequency.UDTFDeconv' +``` + #### Usage This function is used to calculate the deconvolution, i.e. polynomial division. @@ -3387,6 +3613,12 @@ Output series: ### DWT +#### Registration statement + +```sql +create function dwt as 'org.apache.iotdb.library.frequency.UDTFDWT' +``` + #### Usage This function is used to calculate 1d discrete wavelet transform of a numerical series. @@ -3468,6 +3700,12 @@ Output series: ### FFT +#### Registration statement + +```sql +create function fft as 'org.apache.iotdb.library.frequency.UDTFFFT' +``` + #### Usage This function is used to calculate the fast Fourier transform (FFT) of a numerical series. @@ -3592,6 +3830,12 @@ The last data point is reserved to indicate the length of the series. ### HighPass +#### Registration statement + +```sql +create function highpass as 'org.apache.iotdb.library.frequency.UDTFHighPass' +``` + #### Usage This function performs low-pass filtering on the input series and extracts components above the cutoff frequency. @@ -3679,6 +3923,12 @@ Note: The input is $y=sin(2\pi t/4)+2sin(2\pi t/5)$ with a length of 20. Thus, t ### IFFT +#### Registration statement + +```sql +create function ifft as 'org.apache.iotdb.library.frequency.UDTFIFFT' +``` + #### Usage This function treats the two input series as the real and imaginary part of a complex series, performs an inverse fast Fourier transform (IFFT), and outputs the real part of the result. @@ -3756,6 +4006,12 @@ Output series: ### LowPass +#### Registration statement + +```sql +create function lowpass as 'org.apache.iotdb.library.frequency.UDTFLowPass' +``` + #### Usage This function performs low-pass filtering on the input series and extracts components below the cutoff frequency. @@ -3866,6 +4122,12 @@ Note: The input is $y=sin(2\pi t/4)+2sin(2\pi t/5)$ with a length of 20. 
Thus, t ### Cov +#### Registration statement + +```sql +create function cov as 'org.apache.iotdb.library.dmatch.UDAFCov' +``` + #### Usage This function is used to calculate the population covariance. @@ -3927,6 +4189,12 @@ Output series: ### DTW +#### Registration statement + +```sql +create function dtw as 'org.apache.iotdb.library.dmatch.UDAFDtw' +``` + #### Usage This function is used to calculate the DTW distance between two input series. @@ -3992,6 +4260,12 @@ Output series: ### Pearson +#### Registration statement + +```sql +create function pearson as 'org.apache.iotdb.library.dmatch.UDAFPearson' +``` + #### Usage This function is used to calculate the Pearson Correlation Coefficient. @@ -4053,6 +4327,12 @@ Output series: ### PtnSym +#### Registration statement + +```sql +create function ptnsym as 'org.apache.iotdb.library.dmatch.UDTFPtnSym' +``` + #### Usage This function is used to find all symmetric subseries in the input whose degree of symmetry is less than the threshold. @@ -4113,6 +4393,12 @@ Output series: ### XCorr +#### Registration statement + +```sql +create function xcorr as 'org.apache.iotdb.library.dmatch.UDTFXCorr' +``` + #### Usage This function is used to calculate the cross correlation function of given two time series. @@ -4202,6 +4488,14 @@ Output series: ### TimestampRepair +#### Registration statement + +```sql +create function timestamprepair as 'org.apache.iotdb.library.drepair.UDTFTimestampRepair' +``` + +#### Usage + This function is used for timestamp repair. According to the given standard time interval, the method of minimizing the repair cost is adopted. @@ -4303,6 +4597,12 @@ Output series: ### ValueFill +#### Registration statement + +```sql +create function valuefill as 'org.apache.iotdb.library.drepair.UDTFValueFill' +``` + #### Usage This function is used to impute time series. Several methods are supported. 
@@ -4415,6 +4715,12 @@ Output series: ### ValueRepair +#### Registration statement + +```sql +create function valuerepair as 'org.apache.iotdb.library.drepair.UDTFValueRepair' +``` + #### Usage This function is used to repair the value of the time series. @@ -4723,6 +5029,12 @@ Output series: ### ConsecutiveSequences +#### Registration statement + +```sql +create function consecutivesequences as 'org.apache.iotdb.library.series.UDTFConsecutiveSequences' +``` + #### Usage This function is used to find locally longest consecutive subsequences in strictly equispaced multidimensional data. @@ -4811,6 +5123,12 @@ Output series: ### ConsecutiveWindows +#### Registration statement + +```sql +create function consecutivewindows as 'org.apache.iotdb.library.series.UDTFConsecutiveWindows' +``` + #### Usage This function is used to find consecutive windows of specified length in strictly equispaced multidimensional data. @@ -4897,6 +5215,12 @@ Output series: ### AR +#### Registration statement + +```sql +create function ar as 'org.apache.iotdb.library.dlearn.UDTFAR' +``` + #### Usage This function is used to learn the coefficients of the autoregressive models for a time series. diff --git a/src/UserGuide/latest/Reference/UDF-development.md b/src/UserGuide/latest/Reference/UDF-development.md new file mode 100644 index 000000000..8e14f31db --- /dev/null +++ b/src/UserGuide/latest/Reference/UDF-development.md @@ -0,0 +1,646 @@ + # UDF development + +## UDF development + +### UDF Development Dependencies + +If you use [Maven](http://search.maven.org/), you can search for the development dependencies listed below from the [Maven repository](http://search.maven.org/) . Please note that you must select the same dependency version as the target IoTDB server version for development. 
+ +``` xml + + org.apache.iotdb + udf-api + 1.0.0 + provided + +``` + +## UDTF(User Defined Timeseries Generating Function) + +To write a UDTF, you need to inherit the `org.apache.iotdb.udf.api.UDTF` class, and at least implement the `beforeStart` method and a `transform` method. + +#### Interface Description: + +| Interface definition | Description | Required to Implement | +| :----------------------------------------------------------- | :----------------------------------------------------------- | ----------------------------------------------------- | +| void validate(UDFParameterValidator validator) throws Exception | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. | Optional | +| void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception | The initialization method to call the user-defined initialization behavior before a UDTF processes the input data. Every time a user executes a UDTF query, the framework will construct a new UDF instance, and `beforeStart` will be called. | Required | +| void transform(Row row, PointCollector collector) throws Exception | This method is called by the framework. This data processing method will be called when you choose to use the `RowByRowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `Row`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | +| void transform(RowWindow rowWindow, PointCollector collector) throws Exception | This method is called by the framework. This data processing method will be called when you choose to use the `SlidingSizeWindowAccessStrategy` or `SlidingTimeWindowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. 
Input data is passed in by `RowWindow`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | +| void terminate(PointCollector collector) throws Exception | This method is called by the framework. This method will be called once after all `transform` calls have been executed. In a single UDF query, this method will and will only be called once. You need to call the data collection method provided by `collector` to determine the output data. | Optional | +| void beforeDestroy() | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | + +In the life cycle of a UDTF instance, the calling sequence of each method is as follows: + +1. void validate(UDFParameterValidator validator) throws Exception +2. void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception +3. void transform(Row row, PointCollector collector) throws Exception or void transform(RowWindow rowWindow, PointCollector collector) throws Exception +4. void terminate(PointCollector collector) throws Exception +5. void beforeDestroy() + +> Note that every time the framework executes a UDTF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDTF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDTF without considering the influence of concurrency and other factors. + +#### Detailed interface introduction: + +1. **void validate(UDFParameterValidator validator) throws Exception** + +The `validate` method is used to validate the parameters entered by the user. 
+ +In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. + +Please refer to the Javadoc for the usage of `UDFParameterValidator`. + + +2. **void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception** + +This method is mainly used to customize UDTF. In this method, the user can do the following things: + +1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. +2. Set the strategy to access the raw data and set the output data type in UDTFConfigurations. +3. Create resources, such as establishing external connections, opening files, etc. + + +2.1 **UDFParameters** + +`UDFParameters` is used to parse UDF parameters in SQL statements (the part in parentheses after the UDF function name in SQL). The input parameters have two parts. The first part is data types of the time series that the UDF needs to process, and the second part is the key-value pair attributes for customization. Only the second part can be empty. + + +Example: + +``` sql +SELECT UDF(s1, s2, 'key1'='iotdb', 'key2'='123.45') FROM root.sg.d; +``` + +Usage: + +``` java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + String stringValue = parameters.getString("key1"); // iotdb + Float floatValue = parameters.getFloat("key2"); // 123.45 + Double doubleValue = parameters.getDouble("key3"); // null + int intValue = parameters.getIntOrDefault("key4", 678); // 678 + // do something + + // configurations + // ... +} +``` + + +2.2 **UDTFConfigurations** + +You must use `UDTFConfigurations` to specify the strategy used by UDF to access raw data and the type of output sequence. + +Usage: + +``` java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + // parameters + // ... 
+ + // configurations + configurations + .setAccessStrategy(new RowByRowAccessStrategy()) + .setOutputDataType(Type.INT32); +} +``` + +The `setAccessStrategy` method is used to set the UDF's strategy for accessing the raw data, and the `setOutputDataType` method is used to set the data type of the output sequence. + + 2.2.1 **setAccessStrategy** + + +Note that the raw data access strategy you set here determines which `transform` method the framework will call. Please implement the `transform` method corresponding to the raw data access strategy. Of course, you can also dynamically decide which strategy to set based on the attribute parameters parsed by `UDFParameters`. Therefore, two `transform` methods are also allowed to be implemented in one UDF. + +The following are the strategies you can set: + +| Interface definition | Description | The `transform` Method to Call | +| :-------------------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ | +| MappableRowByRow | Custom scalar function
The framework will call the `transform` method once for each row of raw data input, with k columns of time series and 1 row of data input, and 1 column of time series and 1 row of data output. It can be used in any clause and expression where scalar functions appear, such as select clauses, where clauses, etc. | void transform(Column[] columns, ColumnBuilder builder) throws Exception<br>Object transform(Row row) throws Exception | +| RowByRowAccessStrategy | Customize time series generation function to process raw data line by line.<br>
The framework will call the `transform` method once for each row of raw data input, inputting k columns of time series and 1 row of data, and outputting 1 column of time series and n rows of data.
When a sequence is input, the row serves as a data point for the input sequence.
When multiple sequences are input, after aligning the input sequences in time, each row serves as a data point for the input sequence.
(In a row of data, there may be a column with a `null` value, but not all columns are `null`) | void transform(Row row, PointCollector collector) throws Exception | +| SlidingTimeWindowAccessStrategy | Customize time series generation functions to process raw data in a sliding time window manner.
The framework will call the `transform` method once for each raw data input window, input k columns of time series m rows of data, and output 1 column of time series n rows of data.
A window may contain multiple rows of data, and after aligning the input sequence in time, each window serves as a data point for the input sequence.
(Each window may have i rows, and each row of data may have a column with a `null` value, but not all of them are `null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| SlidingSizeWindowAccessStrategy | Customize the time series generation function to process raw data in a fixed number of rows, meaning that each data processing window will contain a fixed number of rows of data (except for the last window).
The framework will call the `transform` method once for each raw data input window, input k columns of time series m rows of data, and output 1 column of time series n rows of data.
A window may contain multiple rows of data, and after aligning the input sequence in time, each window serves as a data point for the input sequence.
(Each window may have i rows, and each row of data may have a column with a `null` value, but not all of them are `null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| SessionTimeWindowAccessStrategy | Customize time series generation functions to process raw data in a session window format.
The framework will call the `transform` method once for each raw data input window, input k columns of time series m rows of data, and output 1 column of time series n rows of data.
A window may contain multiple rows of data, and after aligning the input sequence in time, each window serves as a data point for the input sequence.
(Each window may have i rows, and each row of data may have a column with a `null` value, but not all of them are `null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| StateWindowAccessStrategy | Customize time series generation functions to process raw data in a state window format.
The framework will call the `transform` method once for each raw data input window, inputting 1 column of time series m rows of data and outputting 1 column of time series n rows of data.<br>
A window may contain multiple rows of data, and currently only supports opening windows for one physical quantity, which is one column of data. | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | + + +#### Interface Description: + +- `RowByRowAccessStrategy`: The construction of `RowByRowAccessStrategy` does not require any parameters. + +- `SlidingTimeWindowAccessStrategy` + +Window opening diagram: + + + +`SlidingTimeWindowAccessStrategy`: `SlidingTimeWindowAccessStrategy` has many constructors, you can pass 3 types of parameters to them: + +- Parameter 1: The display window on the time axis + +The first type of parameters are optional. If the parameters are not provided, the beginning time of the display window will be set to the same as the minimum timestamp of the query result set, and the ending time of the display window will be set to the same as the maximum timestamp of the query result set. + +- Parameter 2: Time interval for dividing the time axis (should be positive) +- Parameter 3: Time sliding step (not required to be greater than or equal to the time interval, but must be a positive number) + +The sliding step parameter is also optional. If the parameter is not provided, the sliding step will be set to the same as the time interval for dividing the time axis. + +The relationship between the three types of parameters can be seen in the figure below. Please see the Javadoc for more details. + +
+ +> Note that the actual time interval of some of the last time windows may be less than the specified time interval parameter. In addition, there may be cases where the number of data rows in some time windows is 0. In these cases, the framework will also call the `transform` method for the empty windows. + +- `SlidingSizeWindowAccessStrategy` + +Window opening diagram: + + + +`SlidingSizeWindowAccessStrategy`: `SlidingSizeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: + +* Parameter 1: Window size. This parameter specifies the number of data rows contained in a data processing window. Note that the number of data rows in some of the last time windows may be less than the specified number of data rows. +* Parameter 2: Sliding step. This parameter means the number of rows between the first point of the next window and the first point of the current window. (This parameter is not required to be greater than or equal to the window size, but must be a positive number) + +The sliding step parameter is optional. If the parameter is not provided, the sliding step will be set to the same as the window size. + +- `SessionTimeWindowAccessStrategy` + +Window opening diagram: **Time intervals less than or equal to the given minimum time interval `sessionGap` are assigned in one group.** + + + +`SessionTimeWindowAccessStrategy`: `SessionTimeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: + +- Parameter 1: The display window on the time axis. +- Parameter 2: The minimum time interval `sessionGap` of two adjacent windows. + +- `StateWindowAccessStrategy` + +Window opening diagram: **For numerical data, if the state difference is less than or equal to the given threshold `delta`, it will be assigned in one group.** + + + +`StateWindowAccessStrategy` has four constructors. 
+ +- Constructor 1: For numerical data, there are 3 parameters: the time axis can display the start and end time of the time window and the threshold `delta` for the allowable change within a single window. +- Constructor 2: For text data and boolean data, there are 3 parameters: the time axis can be provided to display the start and end time of the time window. For both data types, the data within a single window is the same, and there is no need to provide an allowable change threshold. +- Constructor 3: For numerical data, there is 1 parameter: you can only provide the threshold delta that is allowed to change within a single window. The start time of the time axis display time window will be defined as the smallest timestamp in the entire query result set, and the time axis display time window end time will be defined as the largest timestamp in the entire query result set. +- Constructor 4: For text data and boolean data, you can provide no parameter. The start and end timestamps are explained in Constructor 3. + +StateWindowAccessStrategy can only take one column as input for now. + +Please see the Javadoc for more details. + + 2.2.2 **setOutputDataType** + +Note that the type of output sequence you set here determines the type of data that the `PointCollector` can actually receive in the `transform` method. 
The relationship between the output data type set in `setOutputDataType` and the actual data output type that `PointCollector` can receive is as follows: + +| Output Data Type Set in `setOutputDataType` | Data Type that `PointCollector` Can Receive | +| :------------------------------------------ | :----------------------------------------------------------- | +| INT32 | int | +| INT64 | long | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | boolean | +| TEXT | java.lang.String and org.apache.iotdb.udf.api.type.Binary` | + +The type of output time series of a UDTF is determined at runtime, which means that a UDTF can dynamically determine the type of output time series according to the type of input time series. +Here is a simple example: + +```java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + // do something + // ... + + configurations + .setAccessStrategy(new RowByRowAccessStrategy()) + .setOutputDataType(parameters.getDataType(0)); +} +``` + +3. **void transform(Row row, PointCollector collector) throws Exception** + +You need to implement this method when you specify the strategy of UDF to read the original data as `RowByRowAccessStrategy`. + +This method processes the raw data one row at a time. The raw data is input from `Row` and output by `PointCollector`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. + +The following is a complete UDF example that implements the `void transform(Row row, PointCollector collector) throws Exception` method. It is an adder that receives two columns of time series as input. When two data points in a row are not `null`, this UDF will output the algebraic sum of these two data points. 
+ +``` java +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Adder implements UDTF { + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT64) + .setAccessStrategy(new RowByRowAccessStrategy()); + } + + @Override + public void transform(Row row, PointCollector collector) throws Exception { + if (row.isNull(0) || row.isNull(1)) { + return; + } + collector.putLong(row.getTime(), row.getLong(0) + row.getLong(1)); + } +} +``` + +4. **void transform(RowWindow rowWindow, PointCollector collector) throws Exception** + +You need to implement this method when you specify the strategy of UDF to read the original data as `SlidingTimeWindowAccessStrategy` or `SlidingSizeWindowAccessStrategy`. + +This method processes a batch of data in a fixed number of rows or a fixed time interval each time, and we call the container containing this batch of data a window. The raw data is input from `RowWindow` and output by `PointCollector`. `RowWindow` can help you access a batch of `Row`, it provides a set of interfaces for random access and iterative access to this batch of `Row`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. + +Below is a complete UDF example that implements the `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` method. 
It is a counter that receives any number of time series as input, and its function is to count and output the number of data rows in each time window within a specified time range. + +```java +import java.io.IOException; +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.access.RowWindow; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.SlidingTimeWindowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Counter implements UDTF { + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT32) + .setAccessStrategy(new SlidingTimeWindowAccessStrategy( + parameters.getLong("time_interval"), + parameters.getLong("sliding_step"), + parameters.getLong("display_window_begin"), + parameters.getLong("display_window_end"))); + } + + @Override + public void transform(RowWindow rowWindow, PointCollector collector) { + if (rowWindow.windowSize() != 0) { + collector.putInt(rowWindow.windowStartTime(), rowWindow.windowSize()); + } + } +} +``` + +5. **void terminate(PointCollector collector) throws Exception** + +In some scenarios, a UDF needs to traverse all the original data to calculate the final output data points. The `terminate` interface provides support for those scenarios. + +This method is called after all `transform` calls are executed and before the `beforeDestroy` method is executed. You can implement the `transform` method to perform pure data processing (without outputting any data points), and implement the `terminate` method to output the processing results. + +The processing results need to be output by the `PointCollector`. 
You can output any number of data points in one `terminate` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. + +Below is a complete UDF example that implements the `void terminate(PointCollector collector) throws Exception` method. It takes one time series whose data type is `INT32` as input, and outputs the maximum value point of the series. + +```java +import java.io.IOException; +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Max implements UDTF { + + private Long time; + private int value; + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT32) + .setAccessStrategy(new RowByRowAccessStrategy()); + } + + @Override + public void transform(Row row, PointCollector collector) { + if (row.isNull(0)) { + return; + } + int candidateValue = row.getInt(0); + if (time == null || value < candidateValue) { + time = row.getTime(); + value = candidateValue; + } + } + + @Override + public void terminate(PointCollector collector) throws IOException { + if (time != null) { + collector.putInt(time, value); + } + } +} +``` + +6. **void beforeDestroy()** + +The method for terminating a UDF. + +This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. 
+ + + +### UDAF (User Defined Aggregation Function) + +A complete definition of UDAF involves two classes, `State` and `UDAF`. + +#### State Class + +To write your own `State`, you need to implement the `org.apache.iotdb.udf.api.State` interface. + +#### Interface Description: + +| Interface Definition | Description | Required to Implement | +| -------------------------------- | ------------------------------------------------------------ | --------------------- | +| void reset() | To reset the `State` object to its initial state, you need to fill in the initial values of the fields in the `State` class within this method as if you were writing a constructor. | Required | +| byte[] serialize() | Serializes `State` to binary data. This method is used for IoTDB internal `State` passing. Note that the order of serialization must be consistent with the following deserialization methods. | Required | +| void deserialize(byte[] bytes) | Deserializes binary data to `State`. This method is used for IoTDB internal `State` passing. Note that the order of deserialization must be consistent with the serialization method above. | Required | + +#### Detailed interface introduction: + +1. **void reset()** + +This method resets the `State` to its initial state, you need to fill in the initial values of the fields in the `State` object in this method. For optimization reasons, IoTDB reuses `State` as much as possible internally, rather than creating a new `State` for each group, which would introduce unnecessary overhead. When `State` has finished updating the data in a group, this method is called to reset to the initial state as a way to process the next group. + +In the case of `State` for averaging (aka `avg`), for example, you would need the sum of the data, `sum`, and the number of entries in the data, `count`, and initialize both to 0 in the `reset()` method. 
+ +```java +class AvgState implements State { + double sum; + + long count; + + @Override + public void reset() { + sum = 0; + count = 0; + } + + // other methods +} +``` + +2. **byte[] serialize()/void deserialize(byte[] bytes)** + +These methods serialize the `State` into binary data, and deserialize the `State` from the binary data. IoTDB, as a distributed database, involves passing data among different nodes, so you need to write these two methods to enable the passing of the State among different nodes. Note that the order of serialization and deserialization must be consistent. + +In the case of `State` for averaging (aka `avg`), for example, you can convert the content of State to `byte[]` array and read out the content of State from `byte[]` array in any way you want; the following shows the code for serialization/deserialization using Java's `ByteBuffer`: + +```java +@Override +public byte[] serialize() { + ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES + Long.BYTES); + buffer.putDouble(sum); + buffer.putLong(count); + + return buffer.array(); +} + +@Override +public void deserialize(byte[] bytes) { + ByteBuffer buffer = ByteBuffer.wrap(bytes); + sum = buffer.getDouble(); + count = buffer.getLong(); +} +``` + + + +#### UDAF Classes + +To write a UDAF, you need to implement the `org.apache.iotdb.udf.api.UDAF` interface. + +#### Interface Description: + +| Interface definition | Description | Required to Implement | +| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------- | +| void validate(UDFParameterValidator validator) throws Exception | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDAFConfigurations)` is called. 
| Optional | +| void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception | Initialization method that invokes user-defined initialization behavior before UDAF processes the input data. Unlike UDTF, configuration is of type `UDAFConfigurations`. | Required | +| State createState() | To create a `State` object, usually just call the default constructor and modify the default initial value as needed. | Required | +| void addInput(State state, Column[] columns, BitMap bitMap) | Update `State` object according to the incoming data `Column[]` in batch, note that the last column `columns[columns.length - 1]` always represents the time column. In addition, `BitMap` represents the data that has been filtered out before, you need to manually determine whether the corresponding data has been filtered out when writing this method. | Required | +| void combineState(State state, State rhs) | Merge `rhs` state into `state` state. In a distributed scenario, the same set of data may be distributed on different nodes, IoTDB generates a `State` object for the partial data on each node, and then calls this method to merge it into the complete `State`. | Required | +| void outputFinal(State state, ResultValue resultValue) | Computes the final aggregated result based on the data in `State`. Note that according to the semantics of the aggregation, only one value can be output per group. | Required | +| void beforeDestroy() | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | + +In the life cycle of a UDAF instance, the calling sequence of each method is as follows: + +1. State createState() +2. void validate(UDFParameterValidator validator) throws Exception +3. void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception +4. void addInput(State state, Column[] columns, BitMap bitMap) +5. 
void combineState(State state, State rhs) +6. void outputFinal(State state, ResultValue resultValue) +7. void beforeDestroy() + +Similar to UDTF, every time the framework executes a UDAF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDAF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDAF without considering the influence of concurrency and other factors. + +#### Detailed interface introduction: + + +1. **void validate(UDFParameterValidator validator) throws Exception** + +Same as UDTF, the `validate` method is used to validate the parameters entered by the user. + +In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. + +2. **void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception** + + The `beforeStart` method does the same thing as in UDTF: + +1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. +2. Set the strategy to access the raw data and set the output data type in UDAFConfigurations. +3. Create resources, such as establishing external connections, opening files, etc. + +The role of the `UDFParameters` type can be seen above. + +2.2 **UDAFConfigurations** + +The difference from UDTF is that UDAF uses `UDAFConfigurations` as the type of `configuration` object. + +Currently, this class only supports setting the type of output data. + +```java +void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { + // parameters + // ... 
+ + // configurations + configurations + .setOutputDataType(Type.INT32); +} +``` + +The relationship between the output type set in `setOutputDataType` and the type of data output that `ResultValue` can actually receive is as follows: + +| The output type set in `setOutputDataType` | The output type that `ResultValue` can actually receive | +| ------------------------------------------ | ------------------------------------------------------- | +| INT32 | int | +| INT64 | long | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | boolean | +| TEXT | org.apache.iotdb.udf.api.type.Binary | + +The output type of the UDAF is determined at runtime. You can dynamically determine the output sequence type based on the input type. + +Here is a simple example: + +```java +void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { + // do something + // ... + + configurations + .setOutputDataType(parameters.getDataType(0)); +} +``` + +3. **State createState()** + + +This method creates and initializes a `State` object for UDAF. Due to the limitations of the Java language, you can only call the default constructor for the `State` class. The default constructor assigns a default initial value to all the fields in the class, and if that initial value does not meet your requirements, you need to initialize them manually within this method. + +The following is an example that includes manual initialization. Suppose you want to implement an aggregate function that multiplies all numbers in the group, then your initial `State` value should be set to 1, but the default constructor initializes it to 0, so you need to initialize `State` manually after calling the default constructor: + +```java +public State createState() { + MultiplyState state = new MultiplyState(); + state.result = 1; + return state; +} +``` + +4. **void addInput(State state, Column[] columns, BitMap bitMap)** + +This method updates the `State` object with the raw input data. 
For performance reasons, also to align with the IoTDB vectorized query engine, the raw input data is no longer a data point, but an array of columns ``Column[]``. Note that the last column (i.e. `columns[columns.length - 1]`) is always the time column, so you can also do different operations in UDAF depending on the time. + +Since the input parameter is not of a single data point type, but of multiple columns, you need to manually filter some of the data in the columns, which is why the third parameter, `BitMap`, exists. It identifies which of these columns have been filtered out, so you don't have to think about the filtered data in any case. + +Here's an example of `addInput()` that counts the number of items (aka count). It shows how you can use `BitMap` to ignore data that has been filtered out. Note that due to the limitations of the Java language, you need to explicitly cast the `State` object from the type defined in the interface to a custom `State` type at the beginning of the method, otherwise you won't be able to use the `State` object. + +```java +public void addInput(State state, Column[] columns, BitMap bitMap) { + CountState countState = (CountState) state; + + int count = columns[0].getPositionCount(); + for (int i = 0; i < count; i++) { + if (bitMap != null && !bitMap.isMarked(i)) { + continue; + } + if (!columns[0].isNull(i)) { + countState.count++; + } + } +} +``` + +5. **void combineState(State state, State rhs)** + + +This method combines two `State`s, or more precisely, updates the first `State` object with the second `State` object. IoTDB is a distributed database, and the data of the same group may be distributed on different nodes. For performance reasons, IoTDB will first aggregate some of the data on each node into `State`, and then merge the `State`s on different nodes that belong to the same group, which is what `combineState` does. + +Here's an example of `combineState()` for averaging (aka avg). 
Similar to `addInput`, you need to do an explicit type conversion for the two `State`s at the beginning. Also note that you are updating the value of the first `State` with the contents of the second `State`. + +```java +public void combineState(State state, State rhs) { + AvgState avgState = (AvgState) state; + AvgState avgRhs = (AvgState) rhs; + + avgState.count += avgRhs.count; + avgState.sum += avgRhs.sum; +} +``` + +6. **void outputFinal(State state, ResultValue resultValue)** + +This method works by calculating the final result from `State`. You need to access the various fields in `State`, derive the final result, and set the final result into the `ResultValue` object. IoTDB internally calls this method once at the end for each group. Note that according to the semantics of aggregation, the final result can only be one value. + +Here is another `outputFinal` example for averaging (aka avg). In addition to the forced type conversion at the beginning, you will also see a specific use of the `ResultValue` object, where the final result is set by `setXXX` (where `XXX` is the type name). + +```java +public void outputFinal(State state, ResultValue resultValue) { + AvgState avgState = (AvgState) state; + + if (avgState.count != 0) { + resultValue.setDouble(avgState.sum / avgState.count); + } else { + resultValue.setNull(); + } +} +``` + +7. **void beforeDestroy()** + + +The method for terminating a UDF. + +This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. + + +### Maven Project Example + +If you use Maven, you can build your own UDF project referring to our **udf-example** module. You can find the project [here](https://github.com/apache/iotdb/tree/master/example/udf). 
+ + +## Contribute universal built-in UDF functions to IoTDB + +This part mainly introduces how external users can contribute their own UDFs to the IoTDB community. + +#### Prerequisites + +1. UDFs must be universal. + + The "universal" mentioned here refers to: UDFs can be widely used in some scenarios. In other words, the UDF function must have reuse value and may be directly used by other users in the community. + + If you are not sure whether the UDF you want to contribute is universal, you can send an email to `dev@iotdb.apache.org` or create an issue to initiate a discussion. + +2. The UDF you are going to contribute has been well tested and can run normally in the production environment. + + +#### What you need to prepare + +1. UDF source code +2. Test cases +3. Instructions + +#### UDF Source Code + +1. Create the UDF main class and related classes in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin` or in its subfolders. +2. Register your UDF in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`. + +#### Test Cases + +At a minimum, you need to write integration tests for the UDF. + +You can add a test class in `integration-test/src/test/java/org/apache/iotdb/db/it/udf`. + + +#### Instructions + +The instructions need to include: the name and the function of the UDF, the attribute parameters that must be provided when the UDF is executed, the applicable scenarios, and the usage examples, etc. + +The instructions for use should include both Chinese and English versions. Instructions for use should be added separately in `docs/zh/UserGuide/Operation Manual/DML Data Manipulation Language.md` and `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md`. + +#### Submit a PR + +When you have prepared the UDF source code, test cases, and instructions, you are ready to submit a Pull Request (PR) on [Github](https://github.com/apache/iotdb). 
You can refer to our code contribution guide to submit a PR: [Development Guide](https://iotdb.apache.org/Community/Development-Guide.html). + + +After the PR review is approved and merged, you have successfully contributed your UDF to the IoTDB community! diff --git a/src/UserGuide/latest/User-Manual/Database-Programming.md b/src/UserGuide/latest/User-Manual/Database-Programming.md index ce100e750..2386a55b4 100644 --- a/src/UserGuide/latest/User-Manual/Database-Programming.md +++ b/src/UserGuide/latest/User-Manual/Database-Programming.md @@ -1036,890 +1036,3 @@ SELECT avg(count_s1) from root.sg_count.d; | :------------------------------------------ | ------------------------------------------------------------ | --------- | ------------- | | `continuous_query_submit_thread` | The number of threads in the scheduled thread pool that submit continuous query tasks periodically | int32 | 2 | | `continuous_query_min_every_interval_in_ms` | The minimum value of the continuous query execution time interval | duration | 1000 | - -## USER-DEFINED FUNCTION (UDF) - -IoTDB provides a variety of built-in functions to meet your computing needs, and you can also create user defined functions to meet more computing needs. - -This document describes how to write, register and use a UDF. - - -### UDF Types - -In IoTDB, you can expand two types of UDF: - -| UDF Class | Description | -| --------------------------------------------------- | ------------------------------------------------------------ | -| UDTF(User Defined Timeseries Generating Function) | This type of function can take **multiple** time series as input, and output **one** time series, which can have any number of data points. | -| UDAF(User Defined Aggregation Function) | Custom Aggregation Functions. This type of function can take one time series as input, and output **one** aggregated data point for each group based on the GROUP BY type. 
| - -### UDF Development Dependencies - -If you use [Maven](http://search.maven.org/), you can search for the development dependencies listed below from the [Maven repository](http://search.maven.org/) . Please note that you must select the same dependency version as the target IoTDB server version for development. - -``` xml - - org.apache.iotdb - udf-api - 1.0.0 - provided - -``` - -### UDTF(User Defined Timeseries Generating Function) - -To write a UDTF, you need to inherit the `org.apache.iotdb.udf.api.UDTF` class, and at least implement the `beforeStart` method and a `transform` method. - -The following table shows all the interfaces available for user implementation. - -| Interface definition | Description | Required to Implement | -| :----------------------------------------------------------- | :----------------------------------------------------------- | ----------------------------------------------------- | -| `void validate(UDFParameterValidator validator) throws Exception` | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. | Optional | -| `void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception` | The initialization method to call the user-defined initialization behavior before a UDTF processes the input data. Every time a user executes a UDTF query, the framework will construct a new UDF instance, and `beforeStart` will be called. | Required | -| `void transform(Row row, PointCollector collector) throws Exception` | This method is called by the framework. This data processing method will be called when you choose to use the `RowByRowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `Row`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. 
| Required to implement at least one `transform` method | -| `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | This method is called by the framework. This data processing method will be called when you choose to use the `SlidingSizeWindowAccessStrategy` or `SlidingTimeWindowAccessStrategy` strategy (set in `beforeStart`) to consume raw data. Input data is passed in by `RowWindow`, and the transformation result should be output by `PointCollector`. You need to call the data collection method provided by `collector` to determine the output data. | Required to implement at least one `transform` method | -| `void terminate(PointCollector collector) throws Exception` | This method is called by the framework. This method will be called once after all `transform` calls have been executed. In a single UDF query, this method will and will only be called once. You need to call the data collection method provided by `collector` to determine the output data. | Optional | -| `void beforeDestroy() ` | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | - -In the life cycle of a UDTF instance, the calling sequence of each method is as follows: - -1. `void validate(UDFParameterValidator validator) throws Exception` -2. `void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception` -3. `void transform(Row row, PointCollector collector) throws Exception` or `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` -4. `void terminate(PointCollector collector) throws Exception` -5. `void beforeDestroy() ` - -Note that every time the framework executes a UDTF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDTF queries (even in the same SQL statement) are isolated. 
You can maintain some state data in the UDTF without considering the influence of concurrency and other factors. - -The usage of each interface will be described in detail below. - - - -#### void validate(UDFParameterValidator validator) throws Exception - -The `validate` method is used to validate the parameters entered by the user. - -In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. - -Please refer to the Javadoc for the usage of `UDFParameterValidator`. - - - -#### void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception - -This method is mainly used to customize UDTF. In this method, the user can do the following things: - -1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. -2. Set the strategy to access the raw data and set the output data type in UDTFConfigurations. -3. Create resources, such as establishing external connections, opening files, etc. - - - - -##### UDFParameters - -`UDFParameters` is used to parse UDF parameters in SQL statements (the part in parentheses after the UDF function name in SQL). The input parameters have two parts. The first part is data types of the time series that the UDF needs to process, and the second part is the key-value pair attributes for customization. Only the second part can be empty. - - -Example: - -``` sql -SELECT UDF(s1, s2, 'key1'='iotdb', 'key2'='123.45') FROM root.sg.d; -``` - -Usage: - -``` java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - String stringValue = parameters.getString("key1"); // iotdb - Float floatValue = parameters.getFloat("key2"); // 123.45 - Double doubleValue = parameters.getDouble("key3"); // null - int intValue = parameters.getIntOrDefault("key4", 678); // 678 - // do something - - // configurations - // ... 
-} -``` - - - -##### UDTFConfigurations - -You must use `UDTFConfigurations` to specify the strategy used by UDF to access raw data and the type of output sequence. - -Usage: - -``` java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - // parameters - // ... - - // configurations - configurations - .setAccessStrategy(new RowByRowAccessStrategy()) - .setOutputDataType(Type.INT32); -} -``` - -The `setAccessStrategy` method is used to set the UDF's strategy for accessing the raw data, and the `setOutputDataType` method is used to set the data type of the output sequence. - - - -###### setAccessStrategy - -Note that the raw data access strategy you set here determines which `transform` method the framework will call. Please implement the `transform` method corresponding to the raw data access strategy. Of course, you can also dynamically decide which strategy to set based on the attribute parameters parsed by `UDFParameters`. Therefore, two `transform` methods are also allowed to be implemented in one UDF. - -The following are the strategies you can set: - -| Interface definition | Description | The `transform` Method to Call | -| :-------------------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ | -| `RowByRowAccessStrategy` | Process raw data row by row. The framework calls the `transform` method once for each row of raw data input. When UDF has only one input sequence, a row of input is one data point in the input sequence. When UDF has multiple input sequences, one row of input is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(Row row, PointCollector collector) throws Exception` | -| `SlidingTimeWindowAccessStrategy` | Process a batch of data in a fixed time interval each time. 
We call the container of a data batch a window. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window, and each row is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `SlidingSizeWindowAccessStrategy` | The raw data is processed batch by batch, and each batch contains a fixed number of raw data rows (except the last batch). We call the container of a data batch a window. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window, and each row is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `SessionTimeWindowAccessStrategy` | The raw data is processed batch by batch. We call the container of a data batch a window. The time interval between each two windows is greater than or equal to the `sessionGap` given by the user. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window, and each row is a result record of the raw query (aligned by time) on these input sequences. (In a row, there may be a column with a value of `null`, but not all of them are `null`) | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `StateWindowAccessStrategy` | The raw data is processed batch by batch. We call the container of a data batch a window. 
In the state window, for text type or boolean type data, each value of the point in window is equal to the value of the first point in the window, and for numerical data, the distance between each value of the point in window and the value of the first point in the window is less than the threshold `delta` given by the user. The framework calls the `transform` method once for each raw data input window. There may be multiple rows of data in a window. Currently, we only support state window for one measurement, that is, a column of data. | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | - - -`RowByRowAccessStrategy`: The construction of `RowByRowAccessStrategy` does not require any parameters. - -The `SlidingTimeWindowAccessStrategy` is shown schematically below. - - -`SlidingTimeWindowAccessStrategy`: `SlidingTimeWindowAccessStrategy` has many constructors, you can pass 3 types of parameters to them: - -- Parameter 1: The display window on the time axis -- Parameter 2: Time interval for dividing the time axis (should be positive) -- Parameter 3: Time sliding step (not required to be greater than or equal to the time interval, but must be a positive number) - -The first type of parameters are optional. If the parameters are not provided, the beginning time of the display window will be set to the same as the minimum timestamp of the query result set, and the ending time of the display window will be set to the same as the maximum timestamp of the query result set. - -The sliding step parameter is also optional. If the parameter is not provided, the sliding step will be set to the same as the time interval for dividing the time axis. - -The relationship between the three types of parameters can be seen in the figure below. Please see the Javadoc for more details. - -
- -Note that the actual time interval of some of the last time windows may be less than the specified time interval parameter. In addition, there may be cases where the number of data rows in some time windows is 0. In these cases, the framework will also call the `transform` method for the empty windows. - -The `SlidingSizeWindowAccessStrategy` is shown schematically below. - - -`SlidingSizeWindowAccessStrategy`: `SlidingSizeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: - -* Parameter 1: Window size. This parameter specifies the number of data rows contained in a data processing window. Note that the number of data rows in some of the last time windows may be less than the specified number of data rows. -* Parameter 2: Sliding step. This parameter means the number of rows between the first point of the next window and the first point of the current window. (This parameter is not required to be greater than or equal to the window size, but must be a positive number) - -The sliding step parameter is optional. If the parameter is not provided, the sliding step will be set to the same as the window size. - -The `SessionTimeWindowAccessStrategy` is shown schematically below. **Time intervals less than or equal to the given minimum time interval `sessionGap` are assigned in one group** - - -`SessionTimeWindowAccessStrategy`: `SessionTimeWindowAccessStrategy` has many constructors, you can pass 2 types of parameters to them: - -- Parameter 1: The display window on the time axis. -- Parameter 2: The minimum time interval `sessionGap` of two adjacent windows. - - -The `StateWindowAccessStrategy` is shown schematically below. **For numerical data, if the state difference is less than or equal to the given threshold `delta`, it will be assigned in one group. ** - - -`StateWindowAccessStrategy` has four constructors. 
- -- Constructor 1: For numerical data, there are 3 parameters: the time axis can display the start and end time of the time window and the threshold `delta` for the allowable change within a single window. -- Constructor 2: For text data and boolean data, there are 3 parameters: the time axis can be provided to display the start and end time of the time window. For both data types, the data within a single window is same, and there is no need to provide an allowable change threshold. -- Constructor 3: For numerical data, there are 1 parameters: you can only provide the threshold delta that is allowed to change within a single window. The start time of the time axis display time window will be defined as the smallest timestamp in the entire query result set, and the time axis display time window end time will be defined as The largest timestamp in the entire query result set. -- Constructor 4: For text data and boolean data, you can provide no parameter. The start and end timestamps are explained in Constructor 3. - -StateWindowAccessStrategy can only take one column as input for now. - -Please see the Javadoc for more details. - - - -###### setOutputDataType - -Note that the type of output sequence you set here determines the type of data that the `PointCollector` can actually receive in the `transform` method. 
The relationship between the output data type set in `setOutputDataType` and the actual data output type that `PointCollector` can receive is as follows: - -| Output Data Type Set in `setOutputDataType` | Data Type that `PointCollector` Can Receive | -| :------------------------------------------ | :----------------------------------------------------------- | -| `INT32` | `int` | -| `INT64` | `long` | -| `FLOAT` | `float` | -| `DOUBLE` | `double` | -| `BOOLEAN` | `boolean` | -| `TEXT` | `java.lang.String` and `org.apache.iotdb.udf.api.type.Binary` | - -The type of output time series of a UDTF is determined at runtime, which means that a UDTF can dynamically determine the type of output time series according to the type of input time series. -Here is a simple example: - -```java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - // do something - // ... - - configurations - .setAccessStrategy(new RowByRowAccessStrategy()) - .setOutputDataType(parameters.getDataType(0)); -} -``` - - - -#### void transform(Row row, PointCollector collector) throws Exception - -You need to implement this method when you specify the strategy of UDF to read the original data as `RowByRowAccessStrategy`. - -This method processes the raw data one row at a time. The raw data is input from `Row` and output by `PointCollector`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. - -The following is a complete UDF example that implements the `void transform(Row row, PointCollector collector) throws Exception` method. It is an adder that receives two columns of time series as input. When two data points in a row are not `null`, this UDF will output the algebraic sum of these two data points. 
- -``` java -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Adder implements UDTF { - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(Type.INT64) - .setAccessStrategy(new RowByRowAccessStrategy()); - } - - @Override - public void transform(Row row, PointCollector collector) throws Exception { - if (row.isNull(0) || row.isNull(1)) { - return; - } - collector.putLong(row.getTime(), row.getLong(0) + row.getLong(1)); - } -} -``` - - - -#### void transform(RowWindow rowWindow, PointCollector collector) throws Exception - -You need to implement this method when you specify the strategy of UDF to read the original data as `SlidingTimeWindowAccessStrategy` or `SlidingSizeWindowAccessStrategy`. - -This method processes a batch of data in a fixed number of rows or a fixed time interval each time, and we call the container containing this batch of data a window. The raw data is input from `RowWindow` and output by `PointCollector`. `RowWindow` can help you access a batch of `Row`, it provides a set of interfaces for random access and iterative access to this batch of `Row`. You can output any number of data points in one `transform` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. - -Below is a complete UDF example that implements the `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` method. 
It is a counter that receives any number of time series as input, and its function is to count and output the number of data rows in each time window within a specified time range. - -```java -import java.io.IOException; -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.access.RowWindow; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.SlidingTimeWindowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Counter implements UDTF { - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(Type.INT32) - .setAccessStrategy(new SlidingTimeWindowAccessStrategy( - parameters.getLong("time_interval"), - parameters.getLong("sliding_step"), - parameters.getLong("display_window_begin"), - parameters.getLong("display_window_end"))); - } - - @Override - public void transform(RowWindow rowWindow, PointCollector collector) { - if (rowWindow.windowSize() != 0) { - collector.putInt(rowWindow.windowStartTime(), rowWindow.windowSize()); - } - } -} -``` - - - -#### void terminate(PointCollector collector) throws Exception - -In some scenarios, a UDF needs to traverse all the original data to calculate the final output data points. The `terminate` interface provides support for those scenarios. - -This method is called after all `transform` calls are executed and before the `beforeDestroy` method is executed. You can implement the `transform` method to perform pure data processing (without outputting any data points), and implement the `terminate` method to output the processing results. - -The processing results need to be output by the `PointCollector`. 
You can output any number of data points in one `terminate` method call. It should be noted that the type of output data points must be the same as you set in the `beforeStart` method, and the timestamps of output data points must be strictly monotonically increasing. - -Below is a complete UDF example that implements the `void terminate(PointCollector collector) throws Exception` method. It takes one time series whose data type is `INT32` as input, and outputs the maximum value point of the series. - -```java -import java.io.IOException; -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Max implements UDTF { - - private Long time; - private int value; - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(Type.INT32) - .setAccessStrategy(new RowByRowAccessStrategy()); - } - - @Override - public void transform(Row row, PointCollector collector) { - if (row.isNull(0)) { - return; - } - int candidateValue = row.getInt(0); - if (time == null || value < candidateValue) { - time = row.getTime(); - value = candidateValue; - } - } - - @Override - public void terminate(PointCollector collector) throws IOException { - if (time != null) { - collector.putInt(time, value); - } - } -} -``` - - - -#### void beforeDestroy() - -The method for terminating a UDF. - -This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. 
- - - -### UDAF (User Defined Aggregation Function) - -A complete definition of UDAF involves two classes, `State` and `UDAF`. - -#### State Class - -To write your own `State`, you need to implement the `org.apache.iotdb.udf.api.State` interface. - -The following table shows all the interfaces available for user implementation. - -| Interface Definition | Description | Required to Implement | -| -------------------------------- | ------------------------------------------------------------ | --------------------- | -| `void reset()` | To reset the `State` object to its initial state, you need to fill in the initial values of the fields in the `State` class within this method as if you were writing a constructor. | Required | -| `byte[] serialize()` | Serializes `State` to binary data. This method is used for IoTDB internal `State` passing. Note that the order of serialization must be consistent with the following deserialization methods. | Required | -| `void deserialize(byte[] bytes)` | Deserializes binary data to `State`. This method is used for IoTDB internal `State` passing. Note that the order of deserialization must be consistent with the serialization method above. | Required | - -The following section describes the usage of each interface in detail. - - - -##### void reset() - -This method resets the `State` to its initial state, you need to fill in the initial values of the fields in the `State` object in this method. For optimization reasons, IoTDB reuses `State` as much as possible internally, rather than creating a new `State` for each group, which would introduce unnecessary overhead. When `State` has finished updating the data in a group, this method is called to reset to the initial state as a way to process the next group. - -In the case of `State` for averaging (aka `avg`), for example, you would need the sum of the data, `sum`, and the number of entries in the data, `count`, and initialize both to 0 in the `reset()` method. 
- -```java -class AvgState implements State { - double sum; - - long count; - - @Override - public void reset() { - sum = 0; - count = 0; - } - - // other methods -} -``` - - - -##### byte[] serialize()/void deserialize(byte[] bytes) - -These methods serialize the `State` into binary data, and deserialize the `State` from the binary data. IoTDB, as a distributed database, involves passing data among different nodes, so you need to write these two methods to enable the passing of the State among different nodes. Note that the order of serialization and deserialization must be consistent. - -In the case of `State` for averaging (aka `avg`), for example, you can convert the content of State to `byte[]` array and read out the content of State from `byte[]` array in any way you want, the following shows the code for serialization/deserialization using Java's `ByteBuffer`: - -```java -@Override -public byte[] serialize() { - ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES + Long.BYTES); - buffer.putDouble(sum); - buffer.putLong(count); - - return buffer.array(); -} - -@Override -public void deserialize(byte[] bytes) { - ByteBuffer buffer = ByteBuffer.wrap(bytes); - sum = buffer.getDouble(); - count = buffer.getLong(); -} -``` - - - -#### UDAF Classes - -To write a UDAF, you need to implement the `org.apache.iotdb.udf.api.UDAF` interface. - -The following table shows all the interfaces available for user implementation. - -| Interface definition | Description | Required to Implement | -| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------- | -| `void validate(UDFParameterValidator validator) throws Exception` | This method is mainly used to validate `UDFParameters` and it is executed before `beforeStart(UDFParameters, UDTFConfigurations)` is called. 
| Optional | -| `void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception` | Initialization method that invokes user-defined initialization behavior before UDAF processes the input data. Unlike UDTF, configuration is of type `UDAFConfiguration`. | Required | -| `State createState()` | To create a `State` object, usually just call the default constructor and modify the default initial value as needed. | Required | -| `void addInput(State state, Column[] columns, BitMap bitMap)` | Update `State` object according to the incoming data `Column[]` in batch, note that last column `columns[columns.length - 1]` always represents the time column. In addition, `BitMap` represents the data that has been filtered out before, you need to manually determine whether the corresponding data has been filtered out when writing this method. | Required | -| `void combineState(State state, State rhs)` | Merge `rhs` state into `state` state. In a distributed scenario, the same set of data may be distributed on different nodes, IoTDB generates a `State` object for the partial data on each node, and then calls this method to merge it into the complete `State`. | Required | -| `void outputFinal(State state, ResultValue resultValue)` | Computes the final aggregated result based on the data in `State`. Note that according to the semantics of the aggregation, only one value can be output per group. | Required | -| `void beforeDestroy() ` | This method is called by the framework after the last input data is processed, and will only be called once in the life cycle of each UDF instance. | Optional | - -In the life cycle of a UDAF instance, the calling sequence of each method is as follows: - -1. `State createState()` -2. `void validate(UDFParameterValidator validator) throws Exception` -3. `void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception` -4. `void addInput(State state, Column[] columns, BitMap bitMap)` -5. 
`void combineState(State state, State rhs)` -6. `void outputFinal(State state, ResultValue resultValue)` -7. `void beforeDestroy()` - -Similar to UDTF, every time the framework executes a UDAF query, a new UDF instance will be constructed. When the query ends, the corresponding instance will be destroyed. Therefore, the internal data of the instances in different UDAF queries (even in the same SQL statement) are isolated. You can maintain some state data in the UDAF without considering the influence of concurrency and other factors. - -The usage of each interface will be described in detail below. - - - -##### void validate(UDFParameterValidator validator) throws Exception - -Same as UDTF, the `validate` method is used to validate the parameters entered by the user. - -In this method, you can limit the number and types of input time series, check the attributes of user input, or perform any custom verification. - - - -##### void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception - - The `beforeStart` method does the same thing as the UDTF's: - -1. Use UDFParameters to get the time series paths and parse key-value pair attributes entered by the user. -2. Set the strategy to access the raw data and set the output data type in UDAFConfigurations. -3. Create resources, such as establishing external connections, opening files, etc. - -The role of the `UDFParameters` type can be seen above. - -###### UDAFConfigurations - -The difference from UDTF is that UDAF uses `UDAFConfigurations` as the type of `configuration` object. - -Currently, this class only supports setting the type of output data. - -```java -void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { - // parameters - // ... 
- - // configurations - configurations - .setOutputDataType(Type.INT32); -} -``` - -The relationship between the output type set in `setOutputDataType` and the type of data output that `ResultValue` can actually receive is as follows: - -| The output type set in `setOutputDataType` | The output type that `ResultValue` can actually receive | -| ------------------------------------------ | ------------------------------------------------------- | -| `INT32` | `int` | -| `INT64` | `long` | -| `FLOAT` | `float` | -| `DOUBLE` | `double` | -| `BOOLEAN` | `boolean` | -| `TEXT` | `org.apache.iotdb.udf.api.type.Binary` | - -The output type of the UDAF is determined at runtime. You can dynamically determine the output sequence type based on the input type. - -Here is a simple example: - -```java -void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { - // do something - // ... - - configurations - .setOutputDataType(parameters.getDataType(0)); -} -``` - - - -##### State createState() - -This method creates and initializes a `State` object for UDAF. Due to the limitations of the Java language, you can only call the default constructor for the `State` class. The default constructor assigns a default initial value to all the fields in the class, and if that initial value does not meet your requirements, you need to initialize them manually within this method. - -The following is an example that includes manual initialization. 
Suppose you want to implement an aggregate function that multiplies all numbers in the group, then your initial `State` value should be set to 1, but the default constructor initializes it to 0, so you need to initialize `State` manually after calling the default constructor: - -```java -public State createState() { - MultiplyState state = new MultiplyState(); - state.result = 1; - return state; -} -``` - - - -##### void addInput(State state, Column[] columns, BitMap bitMap) - -This method updates the `State` object with the raw input data. For performance reasons, also to align with the IoTDB vectorized query engine, the raw input data is no longer a data point, but an array of columns ``Column[]``. Note that the last column (i.e. `columns[columns.length - 1]`) is always the time column, so you can also do different operations in UDAF depending on the time. - -Since the input parameter is not of a single data point type, but of multiple columns, you need to manually filter some of the data in the columns, which is why the third parameter, `BitMap`, exists. It identifies which of these columns have been filtered out, so you don't have to think about the filtered data in any case. - -Here's an example of `addInput()` that counts the number of items (aka count). It shows how you can use `BitMap` to ignore data that has been filtered out. Note that due to the limitations of the Java language, you need to explicitly cast the `State` object from the type defined in the interface to a custom `State` type at the beginning of the method, otherwise you won't be able to use the `State` object. 
- -```java -public void addInput(State state, Column[] columns, BitMap bitMap) { - CountState countState = (CountState) state; - - int count = columns[0].getPositionCount(); - for (int i = 0; i < count; i++) { - if (bitMap != null && !bitMap.isMarked(i)) { - continue; - } - if (!columns[0].isNull(i)) { - countState.count++; - } - } -} -``` - - - -##### void combineState(State state, State rhs) - -This method combines two `State`s, or more precisely, updates the first `State` object with the second `State` object. IoTDB is a distributed database, and the data of the same group may be distributed on different nodes. For performance reasons, IoTDB will first aggregate some of the data on each node into `State`, and then merge the `State`s on different nodes that belong to the same group, which is what `combineState` does. - -Here's an example of `combineState()` for averaging (aka avg). Similar to `addInput`, you need to do an explicit type conversion for the two `State`s at the beginning. Also note that you are updating the value of the first `State` with the contents of the second `State`. - -```java -public void combineState(State state, State rhs) { - AvgState avgState = (AvgState) state; - AvgState avgRhs = (AvgState) rhs; - - avgState.count += avgRhs.count; - avgState.sum += avgRhs.sum; -} -``` - - - -##### void outputFinal(State state, ResultValue resultValue) - -This method works by calculating the final result from `State`. You need to access the various fields in `State`, derive the final result, and set the final result into the `ResultValue` object.IoTDB internally calls this method once at the end for each group. Note that according to the semantics of aggregation, the final result can only be one value. - -Here is another `outputFinal` example for averaging (aka avg). 
In addition to the forced type conversion at the beginning, you will also see a specific use of the `ResultValue` object, where the final result is set by `setXXX` (where `XXX` is the type name). - -```java -public void outputFinal(State state, ResultValue resultValue) { - AvgState avgState = (AvgState) state; - - if (avgState.count != 0) { - resultValue.setDouble(avgState.sum / avgState.count); - } else { - resultValue.setNull(); - } -} -``` - - - -##### void beforeDestroy() - -The method for terminating a UDF. - -This method is called by the framework. For a UDF instance, `beforeDestroy` will be called after the last record is processed. In the entire life cycle of the instance, `beforeDestroy` will only be called once. - - - -### Maven Project Example - -If you use Maven, you can build your own UDF project referring to our **udf-example** module. You can find the project [here](https://github.com/apache/iotdb/tree/master/example/udf). - - - -### UDF Registration - -The process of registering a UDF in IoTDB is as follows: - -1. Implement a complete UDF class, assuming the full class name of this class is `org.apache.iotdb.udf.ExampleUDTF`. -2. Package your project into a JAR. If you use Maven to manage your project, you can refer to the Maven project example above. -3. Make preparations for registration according to the registration mode. For details, see the following example. -4. You can use following SQL to register UDF. - -```sql -CREATE FUNCTION AS (USING URI URI-STRING)? -``` - -#### Example: register UDF named `example`, you can choose either of the following two registration methods - -##### No URI - -Prepare: -When use this method to register,you should put JAR to directory `iotdb-server-1.0.0-all-bin/ext/udf`(directory can config). 
-**Note,you should put JAR to this directory of all DataNodes if using Cluster** - -SQL: - -```sql -CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' -``` - -##### Using URI - -Prepare: -When use this method to register,you need to upload the JAR to URI server and ensure the IoTDB instance executing this registration statement has access to the URI server. -**Note,you needn't place JAR manually,IoTDB will download the JAR and sync it.** - -SQL: - -```sql -CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' -``` - -#### Note - -Since UDF instances are dynamically loaded through reflection technology, you do not need to restart the server during the UDF registration process. - -UDF function names are not case-sensitive. - -Please ensure that the function name given to the UDF is different from all built-in function names. A UDF with the same name as a built-in function cannot be registered. - -We recommend that you do not use classes that have the same class name but different function logic in different JAR packages. For example, in `UDF(UDAF/UDTF): udf1, udf2`, the JAR package of udf1 is `udf1.jar` and the JAR package of udf2 is `udf2.jar`. Assume that both JAR packages contain the `org.apache.iotdb.udf.ExampleUDTF` class. If you use two UDFs in the same SQL statement at the same time, the system will randomly load either of them and may cause inconsistency in UDF execution behavior. - -### UDF Deregistration - -The following shows the SQL syntax of how to deregister a UDF. - -```sql -DROP FUNCTION -``` - -Here is an example: - -```sql -DROP FUNCTION example -``` - - - -### UDF Queries - -The usage of UDF is similar to that of built-in aggregation functions. 
- - - -#### Basic SQL syntax support - -* Support `SLIMIT` / `SOFFSET` -* Support `LIMIT` / `OFFSET` -* Support queries with time filters -* Support queries with value filters - - -#### Queries with * in SELECT Clauses - -Assume that there are 2 time series (`root.sg.d1.s1` and `root.sg.d1.s2`) in the system. - -* **`SELECT example(*) from root.sg.d1`** - -Then the result set will include the results of `example (root.sg.d1.s1)` and `example (root.sg.d1.s2)`. - -* **`SELECT example(s1, *) from root.sg.d1`** - -Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)` and `example(root.sg.d1.s1, root.sg.d1.s2)`. - -* **`SELECT example(*, *) from root.sg.d1`** - -Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)`, `example(root.sg.d1.s2, root.sg.d1.s1)`, `example(root.sg.d1.s1, root.sg.d1.s2)` and `example(root.sg.d1.s2, root.sg.d1.s2)`. - - - -#### Queries with Key-value Attributes in UDF Parameters - -You can pass any number of key-value pair parameters to the UDF when constructing a UDF query. The key and value in the key-value pair need to be enclosed in single or double quotes. Note that key-value pair parameters can only be passed in after all time series have been passed in. 
Here is a set of examples: - -``` sql -SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; -SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; -``` - - - -#### Nested Queries - -``` sql -SELECT s1, s2, example(s1, s2) FROM root.sg.d1; -SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; -SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; -SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; -``` - - - -### Show All Registered UDFs - -``` sql -SHOW FUNCTIONS -``` - - - -### User Permission Management - -There are 1 types of user permissions related to UDF: `USE_UDF` - -* Only users with this permission are allowed to register UDFs -* Only users with this permission are allowed to deregister UDFs -* Only users with this permission are allowed to use UDFs for queries - -For more user permissions related content, please refer to [Account Management Statements](./Authority-Management.md). - - - -### Configurable Properties - -You can use `udf_lib_dir` to config udf lib directory. -When querying by a UDF, IoTDB may prompt that there is insufficient memory. You can resolve the issue by configuring `udf_initial_byte_array_length_for_memory_control`, `udf_memory_budget_in_mb` and `udf_reader_transformer_collector_memory_proportion` in `iotdb-datanode.properties` and restarting the server. - - - -### Contribute UDF - - - -This part mainly introduces how external users can contribute their own UDFs to the IoTDB community. - - - -#### Prerequisites - -1. UDFs must be universal. - - The "universal" mentioned here refers to: UDFs can be widely used in some scenarios. In other words, the UDF function must have reuse value and may be directly used by other users in the community. - - If you are not sure whether the UDF you want to contribute is universal, you can send an email to `dev@iotdb.apache.org` or create an issue to initiate a discussion. - -2. 
The UDF you are going to contribute has been well tested and can run normally in the production environment. - - - -#### What you need to prepare - -1. UDF source code -2. Test cases -3. Instructions - - - -##### UDF Source Code - -1. Create the UDF main class and related classes in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin` or in its subfolders. -2. Register your UDF in `iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`. - - - -##### Test Cases - -At a minimum, you need to write integration tests for the UDF. - -You can add a test class in `integration-test/src/test/java/org/apache/iotdb/db/it/udf`. - - - -##### Instructions - -The instructions need to include: the name and the function of the UDF, the attribute parameters that must be provided when the UDF is executed, the applicable scenarios, and the usage examples, etc. - -The instructions should be added in `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md`. - - - -#### Submit a PR - -When you have prepared the UDF source code, test cases, and instructions, you are ready to submit a Pull Request (PR) on [Github](https://github.com/apache/iotdb). You can refer to our code contribution guide to submit a PR: [Development Guide](https://iotdb.apache.org/Community/Development-Guide.html). - -### Known Implementations - -#### Built-in UDF - -1. Aggregate Functions, such as `SUM`. For details and examples, see the document [Aggregate Functions](../Reference/Function-and-Expression.md#aggregate-functions). -2. Arithmetic Functions, such as `SIN`. For details and examples, see the document [Arithmetic Operators and Functions](../Reference/Function-and-Expression.md#arithmetic-operators-and-functions). -3. Comparison Functions, such as `ON_OFF`. For details and examples, see the document [Comparison Operators and Functions](../Reference/Function-and-Expression.md#comparison-operators-and-functions). -4. 
String Processing Functions, such as `STRING_CONTAINS`. For details and examples, see the document [String Processing](../Reference/Function-and-Expression.md#string-processing). -5. Data Type Conversion Function, such as `CAST`. For details and examples, see the document [Data Type Conversion Function](../Reference/Function-and-Expression.md#data-type-conversion-function). -6. Constant Timeseries Generating Functions, such as `CONST`. For details and examples, see the document [Constant Timeseries Generating Functions](../Reference/Function-and-Expression.md#constant-timeseries-generating-functions). -7. Selector Functions, such as `TOP_K`. For details and examples, see the document [Selector Functions](../Reference/Function-and-Expression.md#selector-functions). -8. Continuous Interval Functions, such as `ZERO_DURATION`. For details and examples, see the document [Continuous Interval Functions](../Reference/Function-and-Expression.md#continuous-interval-functions). -9. Variation Trend Calculation Functions, such as `TIME_DIFFERENCE`. For details and examples, see the document [Variation Trend Calculation Functions](../Reference/Function-and-Expression.md#variation-trend-calculation-functions). -10. Sample Functions, such as `M4`. For details and examples, see the document [Sample Functions](../Reference/Function-and-Expression.md#sample-functions). -11. Change Points Function, such as `CHANGE_POINTS`. For details and examples, see the document [Time-Series](../Reference/Function-and-Expression.md#time-series-processing). - -#### Data Quality Function Library - -##### About - -For applications based on time series data, data quality is vital. **UDF Library** is a collection of IoTDB User Defined Functions (UDF) for data quality, including data profiling, data quality evaluation and data repairing. It effectively meets the demand for data quality in the industrial field. 
- -##### Quick Start - -The functions in this function library are not built-in functions, and must be loaded into the system before use. - -1. [Download](https://archive.apache.org/dist/iotdb/1.0.1/apache-iotdb-1.0.1-library-udf-bin.zip) the JAR with all dependencies and the script of registering UDF. -2. Copy the JAR package to `ext\udf` under the directory of IoTDB system (Please put JAR to this directory of all DataNodes if you use Cluster). -3. Run `sbin\start-server.bat` (for Windows) or `sbin\start-server.sh` (for Linux or MacOS) to start IoTDB server. -4. Copy the script to the directory of IoTDB system (under the root directory, at the same level as `sbin`), modify the parameters in the script if needed and run it to register UDF. - -##### Implemented Functions - -1. Data Quality related functions, such as `Completeness`. For details and examples, see the document [Data-Quality](../Reference/UDF-Libraries.md#data-quality). -2. Data Profiling related functions, such as `ACF`. For details and examples, see the document [Data-Profiling](../Reference/UDF-Libraries.md#data-profiling). -3. Anomaly Detection related functions, such as `IQR`. For details and examples, see the document [Anomaly-Detection](../Reference/UDF-Libraries.md#anomaly-detection). -4. Frequency Domain Analysis related functions, such as `Conv`. For details and examples, see the document [Frequency-Domain](../Reference/UDF-Libraries.md#frequency-domain-analysis). -5. Data Matching related functions, such as `DTW`. For details and examples, see the document [Data-Matching](../Reference/UDF-Libraries.md#data-matching). -6. Data Repairing related functions, such as `TimestampRepair`. For details and examples, see the document [Data-Repairing](../Reference/UDF-Libraries.md#data-repairing). -7. Series Discovery related functions, such as `ConsecutiveSequences`. For details and examples, see the document [Series-Discovery](../Reference/UDF-Libraries.md#series-discovery). -8. 
Machine Learning related functions, such as `AR`. For details and examples, see the document [Machine-Learning](../Reference/UDF-Libraries.md#machine-learning). - - -### Q&A - -Q1: How to modify the registered UDF? - -A1: Assume that the name of the UDF is `example` and the full class name is `org.apache.iotdb.udf.ExampleUDTF`, which is introduced by `example.jar`. - -1. Unload the registered function by executing `DROP FUNCTION example`. -2. Delete `example.jar` under `iotdb-server-1.0.0-all-bin/ext/udf`. -3. Modify the logic in `org.apache.iotdb.udf.ExampleUDTF` and repackage it. The name of the JAR package can still be `example.jar`. -4. Upload the new JAR package to `iotdb-server-1.0.0-all-bin/ext/udf`. -5. Load the new UDF by executing `CREATE FUNCTION example AS "org.apache.iotdb.udf.ExampleUDTF"`. diff --git a/src/UserGuide/latest/User-Manual/User-defined-function.md b/src/UserGuide/latest/User-Manual/User-defined-function.md new file mode 100644 index 000000000..183d01dac --- /dev/null +++ b/src/UserGuide/latest/User-Manual/User-defined-function.md @@ -0,0 +1,213 @@ +# USER-DEFINED FUNCTION (UDF) + +## 1. UDF Introduction + +UDF (User Defined Function) refers to user-defined functions. IoTDB provides a variety of built-in time series processing functions and also supports extending custom functions to meet more computing needs. + +In IoTDB, you can expand two types of UDF: + + + + + + + + + + + + + + + + + + + + + +
UDF ClassAccessStrategyDescription
UDTFMAPPABLE_ROW_BY_ROWCustom scalar function, input k columns of time series and 1 row of data, output 1 column of time series and 1 row of data, can be used in any clause and expression that appears in the scalar function, such as select clause, where clause, etc.
ROW_BY_ROW
SLIDING_TIME_WINDOW
SLIDING_SIZE_WINDOW
SESSION_TIME_WINDOW
STATE_WINDOW
Custom time series generation function, input k columns of time series m rows of data, output 1 column of time series n rows of data, the number of input rows m can be different from the number of output rows n, and can only be used in SELECT clauses.
UDAF-Custom aggregation function, input k columns of time series m rows of data, output 1 column of time series 1 row of data, can be used in any clause and expression that appears in the aggregation function, such as select clause, having clause, etc.
+ +### 1.1 UDF usage + +The usage of UDF is similar to that of regular built-in functions, and can be directly used in SELECT statements like calling regular functions. + +#### 1.Basic SQL syntax support + +* Support `SLIMIT` / `SOFFSET` +* Support `LIMIT` / `OFFSET` +* Support queries with value filters +* Support queries with time filters + + +#### 2. Queries with * in SELECT Clauses + +Assume that there are 2 time series (`root.sg.d1.s1` and `root.sg.d1.s2`) in the system. + +* **`SELECT example(*) from root.sg.d1`** + +Then the result set will include the results of `example (root.sg.d1.s1)` and `example (root.sg.d1.s2)`. + +* **`SELECT example(s1, *) from root.sg.d1`** + +Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)` and `example(root.sg.d1.s1, root.sg.d1.s2)`. + +* **`SELECT example(*, *) from root.sg.d1`** + +Then the result set will include the results of `example(root.sg.d1.s1, root.sg.d1.s1)`, `example(root.sg.d1.s2, root.sg.d1.s1)`, `example(root.sg.d1.s1, root.sg.d1.s2)` and `example(root.sg.d1.s2, root.sg.d1.s2)`. + +#### 3. Queries with Key-value Attributes in UDF Parameters + +You can pass any number of key-value pair parameters to the UDF when constructing a UDF query. The key and value in the key-value pair need to be enclosed in single or double quotes. Note that key-value pair parameters can only be passed in after all time series have been passed in. Here is a set of examples: + + Example: +``` sql +SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; +SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; +``` + +#### 4. Nested Queries + + Example: +``` sql +SELECT s1, s2, example(s1, s2) FROM root.sg.d1; +SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; +SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; +SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; +``` + +## 2. 
UDF Development + +You can refer to UDF development:[Development Guide](../Reference/UDF-development.md) + +## 3. UDF management + +### 3.1 UDF Registration + +The process of registering a UDF in IoTDB is as follows: + +1. Implement a complete UDF class, assuming the full class name of this class is `org.apache.iotdb.udf.ExampleUDTF`. +2. Convert the project into a JAR package. If using Maven to manage the project, you can refer to the [Maven project example](https://github.com/apache/iotdb/tree/master/example/udf) above. +3. Make preparations for registration according to the registration mode. For details, see the following example. +4. You can use following SQL to register UDF. + +```sql +CREATE FUNCTION AS (USING URI URI-STRING) +``` + +#### Example: register UDF named `example`, you can choose either of the following two registration methods + +#### Method 1: Manually place the jar package + +Prepare: +When registering using this method, it is necessary to place the JAR package in advance in the 'ext/udf' directory of all DataNodes in the cluster (which can be configured). + +Registration statement: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' +``` + +#### Method 2: Cluster automatically installs jar packages through URI + +Prepare: +When registering using this method, it is necessary to upload the JAR package to the URI server in advance and ensure that the IoTDB instance executing the registration statement can access the URI server. + +Registration statement: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' +``` + +IoTDB will download JAR packages and synchronize them to the entire cluster. + +#### Note + +1. Since UDF instances are dynamically loaded through reflection technology, you do not need to restart the server during the UDF registration process. + +2. UDF function names are not case-sensitive. + +3. 
Please ensure that the function name given to the UDF is different from all built-in function names. A UDF with the same name as a built-in function cannot be registered. + +4. We recommend that you do not use classes that have the same class name but different function logic in different JAR packages. For example, in `UDF(UDAF/UDTF): udf1, udf2`, the JAR package of udf1 is `udf1.jar` and the JAR package of udf2 is `udf2.jar`. Assume that both JAR packages contain the `org.apache.iotdb.udf.ExampleUDTF` class. If you use two UDFs in the same SQL statement at the same time, the system will randomly load either of them and may cause inconsistency in UDF execution behavior. + +### 3.2 UDF Deregistration + +The SQL syntax is as follows: + +```sql +DROP FUNCTION +``` + +Example: Uninstall the UDF from the above example: + +```sql +DROP FUNCTION example +``` + + + +### 3.3 Show All Registered UDFs + +``` sql +SHOW FUNCTIONS +``` + +### 3.4 UDF configuration + +- UDF configuration allows configuring the storage directory of UDF in `iotdb-common.properties` + ``` Properties +# UDF lib dir + +udf_lib_dir=ext/udf +``` + +- -When using custom functions, there is a message indicating insufficient memory. Change the following configuration parameters in `iotdb-common.properties` and restart the service. + + ``` Properties + +# Used to estimate the memory usage of text fields in a UDF query. +# It is recommended to set this value to be slightly larger than the average length of all text +# effectiveMode: restart +# Datatype: int +udf_initial_byte_array_length_for_memory_control=48 + +# How much memory may be used in ONE UDF query (in MB). +# The upper limit is 20% of allocated memory for read. +# effectiveMode: restart +# Datatype: float +udf_memory_budget_in_mb=30.0 + +# UDF memory allocation ratio. +# The parameter form is a:b:c, where a, b, and c are integers. 
+# effectiveMode: restart +udf_reader_transformer_collector_memory_proportion=1:1:1 +``` + +### 3.5 UDF User Permissions + + +When users use UDF, they will be involved in the `USE_UDF` permission, and only users with this permission are allowed to perform UDF registration, uninstallation, and query operations. + +For more user permissions related content, please refer to [Account Management Statements](./Authority-Management.md). + + +## 4. UDF Libraries + +Based on the ability of user-defined functions, IoTDB provides a series of functions for temporal data processing, including data quality, data profiling, anomaly detection, frequency domain analysis, data matching, data repairing, sequence discovery, machine learning, etc., which can meet the needs of industrial fields for temporal data processing. + +You can refer to the [UDF Libraries](../Reference/UDF-Libraries.md)document to find the installation steps and registration statements for each function, to ensure that all required functions are registered correctly. + + +## 5. Common problem: + +Q1: How to modify the registered UDF? + +A1: Assume that the name of the UDF is `example` and the full class name is `org.apache.iotdb.udf.ExampleUDTF`, which is introduced by `example.jar`. + +1. Unload the registered function by executing `DROP FUNCTION example`. +2. Delete `example.jar` under `iotdb-server-1.0.0-all-bin/ext/udf`. +3. Modify the logic in `org.apache.iotdb.udf.ExampleUDTF` and repackage it. The name of the JAR package can still be `example.jar`. +4. Upload the new JAR package to `iotdb-server-1.0.0-all-bin/ext/udf`. +5. Load the new UDF by executing `CREATE FUNCTION example AS "org.apache.iotdb.udf.ExampleUDTF"`. 
+ diff --git a/src/zh/UserGuide/Master/Reference/UDF-Libraries.md b/src/zh/UserGuide/Master/Reference/UDF-Libraries.md index cbd829937..5b323ff69 100644 --- a/src/zh/UserGuide/Master/Reference/UDF-Libraries.md +++ b/src/zh/UserGuide/Master/Reference/UDF-Libraries.md @@ -20,10 +20,36 @@ --> # UDF函数库 -## 数据质量 +基于用户自定义函数能力,IoTDB 提供了一系列关于时序数据处理的函数,包括数据质量、数据画像、异常检测、 频域分析、数据匹配、数据修复、序列发现、机器学习等,能够满足工业领域对时序数据处理的需求。 + +## 安装步骤 +1. 请获取与 IoTDB 版本兼容的 UDF 函数库 JAR 包的压缩包。 + + | UDF 函数库版本 | 支持的 IoTDB 版本 | 下载链接 | + | --------------- | ----------------- | ------------------------------------------------------------ | + | UDF-1.3.3.zip | V1.3.3及以上 | [压缩包](https://alioss.timecho.com/upload/UDF-1.3.3.zip) | + | UDF-1.3.2.zip | V1.0.0~V1.3.2 | [压缩包](https://alioss.timecho.com/upload/UDF-1.3.2.zip) | + +2. 将获取的压缩包中的 library-udf.jar 文件放置在IoTDB中 `/iotdb-enterprise-x.x.x.x-bin/ext/udf` 的路径下 +3. 在 IoTDB 的 SQL 命令行终端(CLI)或可视化控制台(Workbench)的 SQL 操作界面中,执行下述相应的函数注册语句。 +4. 批量注册:两种注册方式:注册脚本 或 SQL汇总语句 +- 注册脚本 + - 将压缩包中的注册脚本(register-UDF.sh 或 register-UDF.bat)按需复制到 IoTDB 的 tools 目录下,修改脚本中的参数(默认为host=127.0.0.1,rpcPort=6667,user=root,pass=root); + - 启动 IoTDB 服务,运行注册脚本批量注册 UDF +- SQL汇总语句 + - 打开压缩包中的SQl文件,复制全部 SQL 语句,在 IoTDB 的 SQL 命令行终端(CLI)或可视化控制台(Workbench)的 SQL 操作界面中,执行全部 SQl 语句批量注册 UDF + +## 数据质量 + ### Completeness +#### 注册语句 + +```sql +create function completeness as 'org.apache.iotdb.library.dquality.UDTFCompleteness' +``` + #### 函数简介 本函数用于计算时间序列的完整性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的完整性,并输出窗口第一个数据点的时间戳和窗口的完整性。 @@ -150,6 +176,12 @@ select completeness(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 ### Consistency +#### 注册语句 + +```sql +create function consistency as 'org.apache.iotdb.library.dquality.UDTFConsistency' +``` + #### 函数简介 本函数用于计算时间序列的一致性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的一致性,并输出窗口第一个数据点的时间戳和窗口的时效性。 @@ -275,6 +307,12 @@ select consistency(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 ### Timeliness +#### 注册语句 + +```sql +create function timeliness as 
'org.apache.iotdb.library.dquality.UDTFTimeliness' +``` + #### 函数简介 本函数用于计算时间序列的时效性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的时效性,并输出窗口第一个数据点的时间戳和窗口的时效性。 @@ -400,6 +438,12 @@ select timeliness(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 0 ### Validity +#### 注册语句 + +```sql +create function validity as 'org.apache.iotdb.library.dquality.UDTFValidity' +``` + #### 函数简介 本函数用于计算时间序列的有效性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的有效性,并输出窗口第一个数据点的时间戳和窗口的有效性。 @@ -524,6 +568,7 @@ select validity(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 00: +-----------------------------+----------------------------------------+ ``` + - -该部分主要讲述了外部用户如何将自己编写的 UDF 贡献给 IoTDB 社区。 - -#### 前提条件 - -1. UDF 具有通用性。 - - 通用性主要指的是:UDF 在某些业务场景下,可以被广泛使用。换言之,就是 UDF 具有复用价值,可被社区内其他用户直接使用。 - - 如果您不确定自己写的 UDF 是否具有通用性,可以发邮件到 `dev@iotdb.apache.org` 或直接创建 ISSUE 发起讨论。 - -2. UDF 已经完成测试,且能够正常运行在用户的生产环境中。 - -#### 贡献清单 - -1. UDF 的源代码 -2. UDF 的测试用例 -3. UDF 的使用说明 - -##### 源代码 - -1. 在`iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin`中创建 UDF 主类和相关的辅助类。 -2. 在`iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`中注册您编写的 UDF。 - -##### 测试用例 - -您至少需要为您贡献的 UDF 编写集成测试。 - -您可以在`integration-test/src/test/java/org/apache/iotdb/db/it/udf`中为您贡献的 UDF 新增一个测试类进行测试。 - -##### 使用说明 - -使用说明需要包含:UDF 的名称、UDF 的作用、执行函数必须的属性参数、函数的适用的场景以及使用示例等。 - -使用说明需包含中英文两个版本。应分别在 `docs/zh/UserGuide/Operation Manual/DML Data Manipulation Language.md` 和 `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md` 中新增使用说明。 - -#### 提交 PR - -当您准备好源代码、测试用例和使用说明后,就可以将 UDF 贡献到 IoTDB 社区了。在 [Github](https://github.com/apache/iotdb) 上面提交 Pull Request (PR) 即可。具体提交方式见:[贡献指南](https://iotdb.apache.org/zh/Community/Development-Guide.html)。 - -当 PR 评审通过并被合并后,您的 UDF 就已经贡献给 IoTDB 社区了! - -### 已知实现的UDF - -#### 内置UDF - -1. [Aggregate Functions](../Reference/Function-and-Expression.md#聚合函数) 聚合函数 -2. 
[Arithmetic Operators and Functions](../Reference/Function-and-Expression.md#算数运算符) 算数函数 -3. [Comparison Operators and Functions](../Reference/Function-and-Expression.md#比较运算符和函数) 比较函数 -4. [String Processing](../Reference/Function-and-Expression.md#字符串处理) 字符串处理函数 -5. [Data Type Conversion Function](../Reference/Function-and-Expression.md#数据类型转换) 数据类型转换函数 -6. [Constant Timeseries Generating Functions](../Reference/Function-and-Expression.md#常序列生成函数) 常序列生成函数 -7. [Selector Functions](../Reference/Function-and-Expression.md#选择函数) 选择函数 -8. [Continuous Interval Functions](../Reference/Function-and-Expression.md#区间查询函数) 区间查询函数 -9. [Variation Trend Calculation Functions](../Reference/Function-and-Expression.md#趋势计算函数) 趋势计算函数 -10. [Sample Functions](../Reference/Function-and-Expression.md#采样函数) 采样函数 -11. [Time-Series](../Reference/Function-and-Expression.md#时间序列处理) 时间序列处理函数 - -#### 数据质量函数库 - -##### 关于 - -对基于时序数据的应用而言,数据质量至关重要。基于用户自定义函数能力,IoTDB 提供了一系列关于数据质量的函数,包括数据画像、数据质量评估与修复等,能够满足工业领域对数据质量的需求。 - -##### 快速上手 - -**该函数库中的函数不是内置函数,使用前要先加载到系统中。** 操作流程如下: - -1. 在 iotdb 根目录下执行编译指令; - ``` - mvn clean package -pl library-udf -am -DskipTests -Pget-jar-with-dependencies - ``` -2. 将在 target 下生成的带依赖的 jar 包复制到 IoTDB 程序目录的 `ext\udf` 目录下(若您使用的是集群,请将jar包复制到所有DataNode的该目录下),如下图所示; -![](https://alioss.timecho.com/docs/img/20230814-191908.jpg) -3. 下载注册脚本:[linux](https://alioss.timecho.com/docs/img/register-UDF.sh), [windows](https://alioss.timecho.com/docs/img/register-UDF.bat); -4. 将注册脚本复制到 IoTDB 的`sbin`目录下,修改脚本中的参数(默认为host=127.0.0.1,rpcPort=6667,user=root,pass=root); -5. 启动 IoTDB 服务; -6. 运行注册脚本`register-UDF.sh`以注册 UDF。 - -##### 已经实现的函数 - -1. [Data-Quality](../Reference/UDF-Libraries.md#数据质量) 数据质量 -2. [Data-Profiling](../Reference/UDF-Libraries.md#数据画像) 数据画像 -3. [Anomaly-Detection](../Reference/UDF-Libraries.md#异常检测) 异常检测 -4. [Frequency-Domain](../Reference/UDF-Libraries.md#频域分析) 频域分析 -5. [Data-Matching](../Reference/UDF-Libraries.md#数据匹配) 数据匹配 -6. 
[Data-Repairing](../Reference/UDF-Libraries.md#数据修复) 数据修复 -7. [Series-Discovery](../Reference/UDF-Libraries.md#序列发现) 序列发现 -8. [Machine-Learning](../Reference/UDF-Libraries.md#机器学习) 机器学习 - -### Q&A - -Q1: 如何修改已经注册的 UDF? - -A1: 假设 UDF 的名称为`example`,全类名为`org.apache.iotdb.udf.UDTFExample`,由`example.jar`引入 - -1. 首先卸载已经注册的`example`函数,执行`DROP FUNCTION example` -2. 删除 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下的`example.jar` -3. 修改`org.apache.iotdb.udf.UDTFExample`中的逻辑,重新打包,JAR 包的名字可以仍然为`example.jar` -4. 将新的 JAR 包上传至 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下 -5. 装载新的 UDF,执行`CREATE FUNCTION example AS "org.apache.iotdb.udf.UDTFExample"` \ No newline at end of file +| `continuous_query_min_every_interval_in_ms` | 系统允许的连续查询最小的周期性时间间隔 | duration | 1000 | \ No newline at end of file diff --git a/src/zh/UserGuide/Master/User-Manual/User-defined-function.md b/src/zh/UserGuide/Master/User-Manual/User-defined-function.md new file mode 100644 index 000000000..c49c960a9 --- /dev/null +++ b/src/zh/UserGuide/Master/User-Manual/User-defined-function.md @@ -0,0 +1,209 @@ +# 用户自定义函数 + +## 1. UDF 介绍 + +UDF(User Defined Function)即用户自定义函数,IoTDB 提供多种内建的面向时序处理的函数,也支持扩展自定义函数来满足更多的计算需求。 + +IoTDB 支持两种类型的 UDF 函数,如下表所示。 + + + + + + + + + + + + + + + + + + + + + +
UDF 分类数据访问策略描述
UDTFMAPPABLE_ROW_BY_ROW自定义标量函数,输入 k 列时间序列 1 行数据,输出 1 列时间序列 1 行数据,可用于标量函数出现的任何子句和表达式中,如select子句、where子句等。
ROW_BY_ROW
SLIDING_TIME_WINDOW
SLIDING_SIZE_WINDOW
SESSION_TIME_WINDOW
STATE_WINDOW
自定义时间序列生成函数,输入 k 列时间序列 m 行数据,输出 1 列时间序列 n 行数据,输入行数 m 可以与输出行数 n 不相同,只能用于SELECT子句中。
UDAF-自定义聚合函数,输入 k 列时间序列 m 行数据,输出 1 列时间序列 1 行数据,可用于聚合函数出现的任何子句和表达式中,如select子句、having子句等。
+ +### 1.1 UDF 使用 + +UDF 的使用方法与普通内建函数类似,可以直接在 SELECT 语句中像调用普通函数一样使用UDF。 + +#### 1.支持的基础 SQL 语法 + +* `SLIMIT` / `SOFFSET` +* `LIMIT` / `OFFSET` +* 支持值过滤 +* 支持时间过滤 + + +#### 2. 带 * 查询 + +假定现在有时间序列 `root.sg.d1.s1`和 `root.sg.d1.s2`。 + +* **执行`SELECT example(*) from root.sg.d1`** + +那么结果集中将包括`example(root.sg.d1.s1)`和`example(root.sg.d1.s2)`的结果。 + +* **执行`SELECT example(s1, *) from root.sg.d1`** + +那么结果集中将包括`example(root.sg.d1.s1, root.sg.d1.s1)`和`example(root.sg.d1.s1, root.sg.d1.s2)`的结果。 + +* **执行`SELECT example(*, *) from root.sg.d1`** + +那么结果集中将包括`example(root.sg.d1.s1, root.sg.d1.s1)`,`example(root.sg.d1.s2, root.sg.d1.s1)`,`example(root.sg.d1.s1, root.sg.d1.s2)` 和 `example(root.sg.d1.s2, root.sg.d1.s2)`的结果。 + +#### 3. 带自定义输入参数的查询 + +可以在进行 UDF 查询的时候,向 UDF 传入任意数量的键值对参数。键值对中的键和值都需要被单引号或者双引号引起来。注意,键值对参数只能在所有时间序列后传入。下面是一组例子: + + 示例: +``` sql +SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; +SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; +``` + +#### 4. 与其他查询的嵌套查询 + + 示例: +``` sql +SELECT s1, s2, example(s1, s2) FROM root.sg.d1; +SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; +SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; +SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; +``` + + +## 2. UDF 开发 + +可以参考 UDF函数开发:[开发指导](../Reference/UDF-development.md) + +## 3. UDF 管理 + +### 3.1 UDF 注册 + +注册一个 UDF 可以按如下流程进行: + +1. 实现一个完整的 UDF 类,假定这个类的全类名为`org.apache.iotdb.udf.UDTFExample` +2. 将项目打成 JAR 包,如果使用 Maven 管理项目,可以参考 [Maven 项目示例](https://github.com/apache/iotdb/tree/master/example/udf)的写法 +3. 进行注册前的准备工作,根据注册方式的不同需要做不同的准备,具体可参考以下例子 +4. 
使用以下 SQL 语句注册 UDF + +```sql +CREATE FUNCTION AS (USING URI URI-STRING) +``` + +#### 示例:注册名为`example`的 UDF,以下两种注册方式任选其一即可 + +#### 方式一:手动放置jar包 + +准备工作: +使用该种方式注册时,需要提前将 JAR 包放置到集群所有 DataNode 的 `ext/udf`目录下(该目录可配置)。 + +注册语句: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' +``` + +#### 方式二:集群通过URI自动安装jar包 + +准备工作: +使用该种方式注册时,需要提前将 JAR 包上传到 URI 服务器上并确保执行注册语句的 IoTDB 实例能够访问该 URI 服务器。 + +注册语句: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' +``` + +IoTDB 会下载 JAR 包并同步到整个集群。 + +#### 注意 + +1. 由于 IoTDB 的 UDF 是通过反射技术动态装载的,因此在装载过程中无需启停服务器。 + +2. UDF 函数名称是大小写不敏感的。 + +3. 请不要给 UDF 函数注册一个内置函数的名字。使用内置函数的名字给 UDF 注册会失败。 + +4. 不同的 JAR 包中最好不要有全类名相同但实现功能逻辑不一样的类。例如 UDF(UDAF/UDTF):`udf1`、`udf2`分别对应资源`udf1.jar`、`udf2.jar`。如果两个 JAR 包里都包含一个`org.apache.iotdb.udf.UDTFExample`类,当同一个 SQL 中同时使用到这两个 UDF 时,系统会随机加载其中一个类,导致 UDF 执行行为不一致。 + +### 3.2 UDF 卸载 + +SQL 语法如下: + +```sql +DROP FUNCTION +``` + +示例:卸载上述例子的 UDF: + +```sql +DROP FUNCTION example +``` + + +### 3.3 查看所有注册的 UDF + +``` sql +SHOW FUNCTIONS +``` + +### 3.4 UDF 配置 + +- 允许在 `iotdb-system.properties` 中配置 udf 的存储目录.: + ``` Properties +# UDF lib dir + +udf_lib_dir=ext/udf +``` + +- 使用自定义函数时,提示内存不足,更改 `iotdb-system.properties` 中下述配置参数并重启服务。 + ``` Properties + +# Used to estimate the memory usage of text fields in a UDF query. +# It is recommended to set this value to be slightly larger than the average length of all text +# effectiveMode: restart +# Datatype: int +udf_initial_byte_array_length_for_memory_control=48 + +# How much memory may be used in ONE UDF query (in MB). +# The upper limit is 20% of allocated memory for read. +# effectiveMode: restart +# Datatype: float +udf_memory_budget_in_mb=30.0 + +# UDF memory allocation ratio. +# The parameter form is a:b:c, where a, b, and c are integers. 
+# effectiveMode: restart +udf_reader_transformer_collector_memory_proportion=1:1:1 +``` + +### 3.5 UDF 用户权限 + +用户在使用 UDF 时会涉及到 `USE_UDF` 权限,具备该权限的用户才被允许执行 UDF 注册、卸载和查询操作。 + +更多用户权限相关的内容,请参考 [权限管理语句](./Authority-Management.md##权限管理)。 + + +## 4. UDF 函数库 + +基于用户自定义函数能力,IoTDB 提供了一系列关于时序数据处理的函数,包括数据质量、数据画像、异常检测、 频域分析、数据匹配、数据修复、序列发现、机器学习等,能够满足工业领域对时序数据处理的需求。 + +可以参考 [UDF 函数库](../Reference/UDF-Libraries.md)文档,查找安装步骤及每个函数对应的注册语句,以确保正确注册所有需要的函数。 + +## 5. 常见问题: + +1. 如何修改已经注册的 UDF? + +答:假设 UDF 的名称为`example`,全类名为`org.apache.iotdb.udf.UDTFExample`,由`example.jar`引入 + +1. 首先卸载已经注册的`example`函数,执行`DROP FUNCTION example` +2. 删除 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下的`example.jar` +3. 修改`org.apache.iotdb.udf.UDTFExample`中的逻辑,重新打包,JAR 包的名字可以仍然为`example.jar` +4. 将新的 JAR 包上传至 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下 +5. 装载新的 UDF,执行`CREATE FUNCTION example AS "org.apache.iotdb.udf.UDTFExample"` \ No newline at end of file diff --git a/src/zh/UserGuide/latest/Reference/UDF-Libraries.md b/src/zh/UserGuide/latest/Reference/UDF-Libraries.md index e587777eb..7ec9e1a73 100644 --- a/src/zh/UserGuide/latest/Reference/UDF-Libraries.md +++ b/src/zh/UserGuide/latest/Reference/UDF-Libraries.md @@ -20,10 +20,36 @@ --> # UDF函数库 +基于用户自定义函数能力,IoTDB 提供了一系列关于时序数据处理的函数,包括数据质量、数据画像、异常检测、 频域分析、数据匹配、数据修复、序列发现、机器学习等,能够满足工业领域对时序数据处理的需求。 + +## 安装步骤 +1. 请获取与 IoTDB 版本兼容的 UDF 函数库 JAR 包的压缩包。 + + | UDF 函数库版本 | 支持的 IoTDB 版本 | 下载链接 | + | --------------- | ----------------- | ------------------------------------------------------------ | + | UDF-1.3.3.zip | V1.3.3及以上 | [压缩包](https://alioss.timecho.com/upload/UDF-1.3.3.zip) | + | UDF-1.3.2.zip | V1.0.0~V1.3.2 | [压缩包](https://alioss.timecho.com/upload/UDF-1.3.2.zip) | + +2. 将获取的压缩包中的 library-udf.jar 文件放置在IoTDB中 `/iotdb-enterprise-x.x.x.x-bin/ext/udf` 的路径下 +3. 在 IoTDB 的 SQL 命令行终端(CLI)或可视化控制台(Workbench)的 SQL 操作界面中,执行下述相应的函数注册语句。 +4. 
批量注册:两种注册方式:注册脚本 或 SQL汇总语句 +- 注册脚本 + - 将压缩包中的注册脚本(register-UDF.sh 或 register-UDF.bat)按需复制到 IoTDB 的 tools 目录下,修改脚本中的参数(默认为host=127.0.0.1,rpcPort=6667,user=root,pass=root); + - 启动 IoTDB 服务,运行注册脚本批量注册 UDF + +- SQL汇总语句 + - 打开压缩包中的SQl文件,复制全部 SQL 语句,在 IoTDB 的 SQL 命令行终端(CLI)或可视化控制台(Workbench)的 SQL 操作界面中,执行全部 SQl 语句批量注册 UDF + ## 数据质量 ### Completeness +#### 注册语句 + +```sql +create function completeness as 'org.apache.iotdb.library.dquality.UDTFCompleteness' +``` + #### 函数简介 本函数用于计算时间序列的完整性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的完整性,并输出窗口第一个数据点的时间戳和窗口的完整性。 @@ -150,6 +176,12 @@ select completeness(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 ### Consistency +#### 注册语句 + +```sql +create function consistency as 'org.apache.iotdb.library.dquality.UDTFConsistency' +``` + #### 函数简介 本函数用于计算时间序列的一致性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的一致性,并输出窗口第一个数据点的时间戳和窗口的时效性。 @@ -275,6 +307,12 @@ select consistency(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 ### Timeliness +#### 注册语句 + +```sql +create function timeliness as 'org.apache.iotdb.library.dquality.UDTFTimeliness' +``` + #### 函数简介 本函数用于计算时间序列的时效性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的时效性,并输出窗口第一个数据点的时间戳和窗口的时效性。 @@ -400,6 +438,12 @@ select timeliness(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 0 ### Validity +#### 注册语句 + +```sql +create function validity as 'org.apache.iotdb.library.dquality.UDTFValidity' +``` + #### 函数简介 本函数用于计算时间序列的有效性。将输入序列划分为若干个连续且不重叠的窗口,分别计算每一个窗口的有效性,并输出窗口第一个数据点的时间戳和窗口的有效性。 @@ -550,6 +594,12 @@ select validity(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 00: ### ACF +#### 注册语句 + +```sql +create function acf as 'org.apache.iotdb.library.dprofile.UDTFACF' +``` + #### 函数简介 本函数用于计算时间序列的自相关函数值,即序列与自身之间的互相关函数。 @@ -607,6 +657,12 @@ select acf(s1) from root.test.d1 where time <= 2020-01-01 00:00:05 ### Distinct +#### 注册语句 + +```sql +create function distinct as 'org.apache.iotdb.library.dprofile.UDTFDistinct' +``` + #### 函数简介 本函数可以返回输入序列中出现的所有不同的元素。 @@ -660,6 +716,12 @@ select 
distinct(s2) from root.test.d2 ### Histogram +#### 注册语句 + +```sql +create function histogram as 'org.apache.iotdb.library.dprofile.UDTFHistogram' +``` + #### 函数简介 本函数用于计算单列数值型数据的分布直方图。 @@ -740,6 +802,12 @@ select histogram(s1,"min"="1","max"="20","count"="10") from root.test.d1 ### Integral +#### 注册语句 + +```sql +create function integral as 'org.apache.iotdb.library.dprofile.UDAFIntegral' +``` + #### 函数简介 本函数用于计算时间序列的数值积分,即以时间为横坐标、数值为纵坐标绘制的折线图中折线以下的面积。 @@ -831,6 +899,12 @@ $$\frac{1}{2\times 60}[(1+2) \times 1 + (2+3) \times 1 + (5+6) \times 1 + (6+7) ### IntegralAvg +#### 注册语句 + +```sql +create function integralavg as 'org.apache.iotdb.library.dprofile.UDAFIntegralAvg' +``` + #### 函数简介 本函数用于计算时间序列的函数均值,即在相同时间单位下的数值积分除以序列总的时间跨度。更多关于数值积分计算的信息请参考`Integral`函数。 @@ -891,6 +965,12 @@ $$\frac{1}{2}[(1+2)\times 1 + (2+5) \times 1 + (5+6) \times 1 + (6+7) \times 1 + ### Mad +#### 注册语句 + +```sql +create function mad as 'org.apache.iotdb.library.dprofile.UDAFMad' +``` + #### 函数简介 本函数用于计算单列数值型数据的精确或近似绝对中位差,绝对中位差为所有数值与其中位数绝对偏移量的中位数。 @@ -990,6 +1070,12 @@ select mad(s0, "error"="0.01") from root.test ### Median +#### 注册语句 + +```sql +create function median as 'org.apache.iotdb.library.dprofile.UDAFMedian' +``` + #### 函数简介 本函数用于计算单列数值型数据的精确或近似中位数。中位数是顺序排列的一组数据中居于中间位置的数;当序列有偶数个时,中位数为中间二者的平均数。 @@ -1060,6 +1146,12 @@ select median(s0, "error"="0.01") from root.test ### MinMax +#### 注册语句 + +```sql +create function minmax as 'org.apache.iotdb.library.dprofile.UDTFMinMax' +``` + #### 函数简介 本函数将输入序列使用 min-max 方法进行标准化。最小值归一至 0,最大值归一至 1. 
@@ -1201,6 +1293,12 @@ select mode(s2) from root.test.d2 ### MvAvg +#### 注册语句 + +```sql +create function mvavg as 'org.apache.iotdb.library.dprofile.UDTFMvAvg' +``` + #### 函数简介 本函数计算序列的移动平均。 @@ -1283,6 +1381,12 @@ select mvavg(s1, "window"="3") from root.test ### PACF +#### 注册语句 + +```sql +create function pacf as 'org.apache.iotdb.library.dprofile.UDTFPACF' +``` + #### 函数简介 本函数通过求解 Yule-Walker 方程,计算序列的偏自相关系数。对于特殊的输入序列,方程可能没有解,此时输出`NaN`。 @@ -1354,6 +1458,12 @@ select pacf(s1, "lag"="5") from root.test ### Percentile +#### 注册语句 + +```sql +create function percentile as 'org.apache.iotdb.library.dprofile.UDAFPercentile' +``` + #### 函数简介 本函数用于计算单列数值型数据的精确或近似分位数。 @@ -1427,6 +1537,12 @@ select percentile(s0, "rank"="0.2", "error"="0.01") from root.test ### Quantile +#### 注册语句 + +```sql +create function quantile as 'org.apache.iotdb.library.dprofile.UDAFQuantile' +``` + #### 函数简介 本函数用于计算单列数值型数据的近似分位数。本函数基于KLL sketch算法实现。 @@ -1501,6 +1617,12 @@ select quantile(s0, "rank"="0.2", "K"="800") from root.test ### Period +#### 注册语句 + +```sql +create function period as 'org.apache.iotdb.library.dprofile.UDAFPeriod' +``` + #### 函数简介 本函数用于计算单列数值型数据的周期。 @@ -1549,6 +1671,12 @@ select period(s1) from root.test.d3 ### QLB +#### 注册语句 + +```sql +create function qlb as 'org.apache.iotdb.library.dprofile.UDTFQLB' +``` + #### 函数简介 本函数对输入序列计算$Q_{LB} $统计量,并计算对应的p值。p值越小表明序列越有可能为非平稳序列。 @@ -1634,6 +1762,12 @@ select QLB(s1) from root.test.d1 ### Resample +#### 注册语句 + +```sql +create function re_sample as 'org.apache.iotdb.library.dprofile.UDTFResample' +``` + #### 函数简介 本函数对输入序列按照指定的频率进行重采样,包括上采样和下采样。目前,本函数支持的上采样方法包括`NaN`填充法 (NaN)、前值填充法 (FFill)、后值填充法 (BFill) 以及线性插值法 (Linear);本函数支持的下采样方法为分组聚合,聚合方法包括最大值 (Max)、最小值 (Min)、首值 (First)、末值 (Last)、平均值 (Mean)和中位数 (Median)。 @@ -1752,6 +1886,12 @@ select resample(s1,'every'='30m','start'='2021-03-06 15:00:00') from root.test.d ### Sample +#### 注册语句 + +```sql +create function sample as 'org.apache.iotdb.library.dprofile.UDTFSample' +``` + #### 函数简介 
本函数对输入序列进行采样,即从输入序列中选取指定数量的数据点并输出。目前,本函数支持三种采样方法:**蓄水池采样法 (reservoir sampling)** 对数据进行随机采样,所有数据点被采样的概率相同;**等距采样法 (isometric sampling)** 按照相等的索引间隔对数据进行采样,**最大三角采样法 (triangle sampling)** 对所有数据会按采样率分桶,每个桶内会计算数据点间三角形面积,并保留面积最大的点,该算法通常用于数据的可视化展示中,采用过程可以保证一些关键的突变点在采用中得到保留,更多抽样算法细节可以阅读论文 [here](http://skemman.is/stream/get/1946/15343/37285/3/SS_MSthesis.pdf)。 @@ -1843,6 +1983,12 @@ select sample(s1,'method'='isometric','k'='5') from root.test.d1 ### Segment +#### 注册语句 + +```sql +create function segment as 'org.apache.iotdb.library.dprofile.UDTFSegment' +``` + #### 函数简介 本函数按照数据的线性变化趋势将数据划分为多个子序列,返回分段直线拟合后的子序列首值或所有拟合值。 @@ -1935,6 +2081,12 @@ select segment(s1,"error"="0.1") from root.test ### Skew +#### 注册语句 + +```sql +create function skew as 'org.apache.iotdb.library.dprofile.UDAFSkew' +``` + #### 函数简介 本函数用于计算单列数值型数据的总体偏度 @@ -1996,6 +2148,12 @@ select skew(s1) from root.test.d1 ### Spline +#### 注册语句 + +```sql +create function spline as 'org.apache.iotdb.library.dprofile.UDTFSpline' +``` + #### 函数简介 本函数提供对原始序列进行三次样条曲线拟合后的插值重采样。 @@ -2203,6 +2361,12 @@ select spline(s1, "points"="151") from root.test ### Spread +#### 注册语句 + +```sql +create function spread as 'org.apache.iotdb.library.dprofile.UDAFSpread' +``` + #### 函数简介 本函数用于计算时间序列的极差,即最大值减去最小值的结果。 @@ -2320,6 +2484,12 @@ select stddev(s1) from root.test.d1 ### ZScore +#### 注册语句 + +```sql +create function zscore as 'org.apache.iotdb.library.dprofile.UDTFZScore' +``` + #### 函数简介 本函数将输入序列使用z-score方法进行归一化。 @@ -2429,6 +2599,12 @@ select zscore(s1) from root.test ### IQR +#### 注册语句 + +```sql +create function iqr as 'org.apache.iotdb.library.anomaly.UDTFIQR' +``` + #### 函数简介 本函数用于检验超出上下四分位数1.5倍IQR的数据分布异常。 @@ -2498,6 +2674,12 @@ select iqr(s1) from root.test ### KSigma +#### 注册语句 + +```sql +create function ksigma as 'org.apache.iotdb.library.anomaly.UDTFKSigma' +``` + #### 函数简介 本函数利用动态 K-Sigma 算法进行异常检测。在一个窗口内,与平均值的差距超过k倍标准差的数据将被视作异常并输出。 @@ -2565,6 +2747,12 @@ select ksigma(s1,"k"="1.0") from root.test.d1 where time <= 2020-01-01 
00:00:30 ### LOF +#### 注册语句 + +```sql +create function LOF as 'org.apache.iotdb.library.anomaly.UDTFLOF' +``` + #### 函数简介 本函数使用局部离群点检测方法用于查找序列的密度异常。将根据提供的第k距离数及局部离群点因子(lof)阈值,判断输入数据是否为离群点,即异常,并输出各点的 LOF 值。 @@ -2694,6 +2882,12 @@ select lof(s1, "method"="series") from root.test.d1 where time<1000 ### MissDetect +#### 注册语句 + +```sql +create function missdetect as 'org.apache.iotdb.library.anomaly.UDTFMissDetect' +``` + #### 函数简介 本函数用于检测数据中的缺失异常。在一些数据中,缺失数据会被线性插值填补,在数据中出现完美的线性片段,且这些片段往往长度较大。本函数通过在数据中发现这些完美线性片段来检测缺失异常。 @@ -2781,6 +2975,12 @@ select missdetect(s2,'minlen'='10') from root.test.d2 ### Range +#### 注册语句 + +```sql +create function range as 'org.apache.iotdb.library.anomaly.UDTFRange' +``` + #### 函数简介 本函数用于查找时间序列的范围异常。将根据提供的上界与下界,判断输入数据是否越界,即异常,并输出所有异常点为新的时间序列。 @@ -2847,6 +3047,12 @@ select range(s1,"lower_bound"="101.0","upper_bound"="125.0") from root.test.d1 w ### TwoSidedFilter +#### 注册语句 + +```sql +create function twosidedfilter as 'org.apache.iotdb.library.anomaly.UDTFTwoSidedFilter' +``` + #### 函数简介 本函数基于双边窗口检测法对输入序列中的异常点进行过滤。 @@ -2939,6 +3145,12 @@ select TwoSidedFilter(s0, 'len'='5', 'threshold'='0.3') from root.test ### Outlier +#### 注册语句 + +```sql +create function outlier as 'org.apache.iotdb.library.anomaly.UDTFOutlier' +``` + #### 函数简介 本函数用于检测基于距离的异常点。在当前窗口中,如果一个点距离阈值范围内的邻居数量(包括它自己)少于密度阈值,则该点是异常点。 @@ -3267,6 +3479,12 @@ select MasterDetect(lo,la,m_lo,m_la,model,'output_type'='anomaly','p'='3','k'='3 ### Conv +#### 注册语句 + +```sql +create function conv as 'org.apache.iotdb.library.frequency.UDTFConv' +``` + #### 函数简介 本函数对两个输入序列进行卷积,即多项式乘法。 @@ -3315,6 +3533,12 @@ select conv(s1,s2) from root.test.d2 ### Deconv +#### 注册语句 + +```sql +create function deconv as 'org.apache.iotdb.library.frequency.UDTFDeconv' +``` + #### 函数简介 本函数对两个输入序列进行去卷积,即多项式除法运算。 @@ -3392,6 +3616,12 @@ select deconv(s3,s2,'result'='remainder') from root.test.d2 ### DWT +#### 注册语句 + +```sql +create function dwt as 'org.apache.iotdb.library.frequency.UDTFDWT' +``` + #### 函数简介 
本函数对输入序列进行一维离散小波变换。 @@ -3473,6 +3703,12 @@ select dwt(s1,"method"="haar") from root.test.d1 ### FFT +#### 注册语句 + +```sql +create function fft as 'org.apache.iotdb.library.frequency.UDTFFFT' +``` + #### 函数简介 本函数对输入序列进行快速傅里叶变换。 @@ -3593,6 +3829,12 @@ select fft(s1, 'result'='real', 'compress'='0.99'), fft(s1, 'result'='imag','com ### HighPass +#### 注册语句 + +```sql +create function highpass as 'org.apache.iotdb.library.frequency.UDTFHighPass' +``` + #### 函数简介 本函数对输入序列进行高通滤波,提取高于截止频率的分量。输入序列的时间戳将被忽略,所有数据点都将被视作等距的。 @@ -3681,6 +3923,12 @@ select highpass(s1,'wpass'='0.45') from root.test.d1 ### IFFT +#### 注册语句 + +```sql +create function ifft as 'org.apache.iotdb.library.frequency.UDTFIFFT' +``` + #### 函数简介 本函数将输入的两个序列作为实部和虚部视作一个复数,进行逆快速傅里叶变换,并输出结果的实部。输入数据的格式参见`FFT`函数的输出,并支持以`FFT`函数压缩后的输出作为本函数的输入。 @@ -3756,6 +4004,12 @@ select ifft(re, im, 'interval'='1m', 'start'='2021-01-01 00:00:00') from root.te ### LowPass +#### 注册语句 + +```sql +create function lowpass as 'org.apache.iotdb.library.frequency.UDTFLowPass' +``` + #### 函数简介 本函数对输入序列进行低通滤波,提取低于截止频率的分量。输入序列的时间戳将被忽略,所有数据点都将被视作等距的。 @@ -3866,6 +4120,12 @@ select lowpass(s1,'wpass'='0.45') from root.test.d1 ### Cov +#### 注册语句 + +```sql +create function cov as 'org.apache.iotdb.library.dmatch.UDAFCov' +``` + #### 函数简介 本函数用于计算两列数值型数据的总体协方差。 @@ -3927,6 +4187,12 @@ select cov(s1,s2) from root.test.d2 ### Dtw +#### 注册语句 + +```sql +create function dtw as 'org.apache.iotdb.library.dmatch.UDAFDtw' +``` + #### 函数简介 本函数用于计算两列数值型数据的 DTW 距离。 @@ -3992,6 +4258,12 @@ select dtw(s1,s2) from root.test.d2 ### Pearson +#### 注册语句 + +```sql +create function pearson as 'org.apache.iotdb.library.dmatch.UDAFPearson' +``` + #### 函数简介 本函数用于计算两列数值型数据的皮尔森相关系数。 @@ -4052,6 +4324,12 @@ select pearson(s1,s2) from root.test.d2 ### PtnSym +#### 注册语句 + +```sql +create function ptnsym as 'org.apache.iotdb.library.dmatch.UDTFPtnSym' +``` + #### 函数简介 本函数用于寻找序列中所有对称度小于阈值的对称子序列。对称度通过 DTW 计算,值越小代表序列对称性越高。 @@ -4110,6 +4388,12 @@ select ptnsym(s4, 'window'='5', 
'threshold'='0') from root.test.d1 ### XCorr +#### 注册语句 + +```sql +create function xcorr as 'org.apache.iotdb.library.dmatch.UDTFXCorr' +``` + #### 函数简介 本函数用于计算两条时间序列的互相关函数值, @@ -4200,6 +4484,12 @@ select xcorr(s1, s2) from root.test.d1 where time <= 2020-01-01 00:00:05 ### TimestampRepair +#### 注册语句 + +```sql +create function timestamprepair as 'org.apache.iotdb.library.drepair.UDTFTimestampRepair' +``` + ### 函数简介 本函数用于时间戳修复。根据给定的标准时间间隔,采用最小化修复代价的方法,通过对数据时间戳的微调,将原本时间戳间隔不稳定的数据修复为严格等间隔的数据。在未给定标准时间间隔的情况下,本函数将使用时间间隔的中位数 (median)、众数 (mode) 或聚类中心 (cluster) 来推算标准时间间隔。 @@ -4297,6 +4587,12 @@ select timestamprepair(s1) from root.test.d2 ### ValueFill +#### 注册语句 + +```sql +create function valuefill as 'org.apache.iotdb.library.drepair.UDTFValueFill' +``` + #### 函数简介 **函数名:** ValueFill @@ -4408,6 +4704,12 @@ select valuefill(s1,"method"="previous") from root.test.d2 ### ValueRepair +#### 注册语句 + +```sql +create function valuerepair as 'org.apache.iotdb.library.drepair.UDTFValueRepair' +``` + #### 函数简介 本函数用于对时间序列的数值进行修复。目前,本函数支持两种修复方法:**Screen** 是一种基于速度阈值的方法,在最小改动的前提下使得所有的速度符合阈值要求;**LsGreedy** 是一种基于速度变化似然的方法,将速度变化建模为高斯分布,并采用贪心算法极大化似然函数。 @@ -4711,6 +5013,12 @@ select seasonalrepair(s1,'method'='improved','period'=3) from root.test.d2 ### ConsecutiveSequences +#### 注册语句 + +```sql +create function consecutivesequences as 'org.apache.iotdb.library.series.UDTFConsecutiveSequences' +``` + #### 函数简介 本函数用于在多维严格等间隔数据中发现局部最长连续子序列。 @@ -4799,6 +5107,12 @@ select consecutivesequences(s1,s2) from root.test.d1 ### ConsecutiveWindows +#### 注册语句 + +```sql +create function consecutivewindows as 'org.apache.iotdb.library.series.UDTFConsecutiveWindows' +``` + #### 函数简介 本函数用于在多维严格等间隔数据中发现指定长度的连续窗口。 @@ -4885,6 +5199,11 @@ select consecutivewindows(s1,s2,'length'='10m') from root.test.d1 ### AR +#### 注册语句 + +```sql +create function ar as 'org.apache.iotdb.library.dlearn.UDTFAR' +``` #### 函数简介 本函数用于学习数据的自回归模型系数。 diff --git a/src/zh/UserGuide/latest/Reference/UDF-development.md 
b/src/zh/UserGuide/latest/Reference/UDF-development.md new file mode 100644 index 000000000..5e0d4ec76 --- /dev/null +++ b/src/zh/UserGuide/latest/Reference/UDF-development.md @@ -0,0 +1,626 @@ +# UDF 开发 + +## UDF 开发 + +### UDF 依赖 + +如果您使用 [Maven](http://search.maven.org/) ,可以从 [Maven 库](http://search.maven.org/) 中搜索下面示例中的依赖。请注意选择和目标 IoTDB 服务器版本相同的依赖版本。 + +``` xml + + org.apache.iotdb + udf-api + 1.0.0 + provided + +``` + +### UDTF(User Defined Timeseries Generating Function) + +编写一个 UDTF 需要继承`org.apache.iotdb.udf.api.UDTF`类,并至少实现`beforeStart`方法和一种`transform`方法。 + +#### 接口说明: + +| 接口定义 | 描述 | 是否必须 | +| :----------------------------------------------------------- | :----------------------------------------------------------- | ------------------ | +| void validate(UDFParameterValidator validator) throws Exception | 在初始化方法`beforeStart`调用前执行,用于检测`UDFParameters`中用户输入的参数是否合法。 | 否 | +| void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception | 初始化方法,在 UDTF 处理输入数据前,调用用户自定义的初始化行为。用户每执行一次 UDTF 查询,框架就会构造一个新的 UDF 类实例,该方法在每个 UDF 类实例被初始化时调用一次。在每一个 UDF 类实例的生命周期内,该方法只会被调用一次。 | 是 | +| void transform(Row row, PointCollector collector) throws Exception | 这个方法由框架调用。当您在`beforeStart`中选择以`RowByRowAccessStrategy`的策略消费原始数据时,这个数据处理方法就会被调用。输入参数以`Row`的形式传入,输出结果通过`PointCollector`输出。您需要在该方法内自行调用`collector`提供的数据收集方法,以决定最终的输出数据。 | 与下面的方法二选一 | +| void transform(RowWindow rowWindow, PointCollector collector) throws Exception | 这个方法由框架调用。当您在`beforeStart`中选择以`SlidingSizeWindowAccessStrategy`或者`SlidingTimeWindowAccessStrategy`的策略消费原始数据时,这个数据处理方法就会被调用。输入参数以`RowWindow`的形式传入,输出结果通过`PointCollector`输出。您需要在该方法内自行调用`collector`提供的数据收集方法,以决定最终的输出数据。 | 与上面的方法二选一 | +| void terminate(PointCollector collector) throws Exception | 这个方法由框架调用。该方法会在所有的`transform`调用执行完成后,在`beforeDestory`方法执行前被调用。在一个 UDF 查询过程中,该方法会且只会调用一次。您需要在该方法内自行调用`collector`提供的数据收集方法,以决定最终的输出数据。 | 否 | +| void beforeDestroy() | UDTF 的结束方法。此方法由框架调用,并且只会被调用一次,即在处理完最后一条记录之后被调用。 | 否 | + +在一个完整的 UDTF 
实例生命周期中,各个方法的调用顺序如下: + +1. void validate(UDFParameterValidator validator) throws Exception +2. void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception +3. void transform(Row row, PointCollector collector) throws Exception 或者 void transform(RowWindow rowWindow, PointCollector collector) throws Exception +4. void terminate(PointCollector collector) throws Exception +5. void beforeDestroy() + +> 注意,框架每执行一次 UDTF 查询,都会构造一个全新的 UDF 类实例,查询结束时,对应的 UDF 类实例即被销毁,因此不同 UDTF 查询(即使是在同一个 SQL 语句中)UDF 类实例内部的数据都是隔离的。您可以放心地在 UDTF 中维护一些状态数据,无需考虑并发对 UDF 类实例内部状态数据的影响。 + +#### 接口详细介绍: + +1. **void validate(UDFParameterValidator validator) throws Exception** + + `validate`方法能够对用户输入的参数进行验证。 + + 您可以在该方法中限制输入序列的数量和类型,检查用户输入的属性或者进行自定义逻辑的验证。 + + `UDFParameterValidator`的使用方法请见 Javadoc。 + +2. **void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception** + + `beforeStart`方法有三个作用: + 1. 帮助用户解析 SQL 语句中的 UDF 参数 + 2. 配置 UDF 运行时必要的信息,即指定 UDF 访问原始数据时采取的策略和输出结果序列的类型 + 3. 创建资源,比如建立外部链接,打开文件等 + +2.1 **UDFParameters** + +`UDFParameters`的作用是解析 SQL 语句中的 UDF 参数(SQL 中 UDF 函数名称后括号中的部分)。参数包括序列类型参数和字符串 key-value 对形式输入的属性参数。 + +示例: + +``` sql +SELECT UDF(s1, s2, 'key1'='iotdb', 'key2'='123.45') FROM root.sg.d; +``` + +用法: + +``` java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + String stringValue = parameters.getString("key1"); // iotdb + Float floatValue = parameters.getFloat("key2"); // 123.45 + Double doubleValue = parameters.getDouble("key3"); // null + int intValue = parameters.getIntOrDefault("key4", 678); // 678 + // do something + + // configurations + // ... +} +``` + +2.2 **UDTFConfigurations** + +您必须使用 `UDTFConfigurations` 指定 UDF 访问原始数据时采取的策略和输出结果序列的类型。 + +用法: + +``` java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + // parameters + // ... 
+ + // configurations + configurations + .setAccessStrategy(new RowByRowAccessStrategy()) + .setOutputDataType(Type.INT32); +} +``` + +其中`setAccessStrategy`方法用于设定 UDF 访问原始数据时采取的策略,`setOutputDataType`用于设定输出结果序列的类型。 + + 2.2.1 **setAccessStrategy** + +注意,您在此处设定的原始数据访问策略决定了框架会调用哪一种`transform`方法 ,请实现与原始数据访问策略对应的`transform`方法。当然,您也可以根据`UDFParameters`解析出来的属性参数,动态决定设定哪一种策略,因此,实现两种`transform`方法也是被允许的。 + +下面是您可以设定的访问原始数据的策略: + +| 接口定义 | 描述 | 调用的`transform`方法 | +| ------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| MappableRowByRow | 自定义标量函数
框架会为每一行原始数据输入调用一次`transform`方法,输入 k 列时间序列 1 行数据,输出 1 列时间序列 1 行数据,可用于标量函数出现的任何子句和表达式中,如select子句、where子句等。 | void transform(Column[] columns, ColumnBuilder builder) throws ExceptionObject transform(Row row) throws Exception | +| RowByRowAccessStrategy | 自定义时间序列生成函数,逐行地处理原始数据。
框架会为每一行原始数据输入调用一次`transform`方法,输入 k 列时间序列 1 行数据,输出 1 列时间序列 n 行数据。
当输入一个序列时,该行就作为输入序列的一个数据点。
当输入多个序列时,输入序列按时间对齐后,每一行作为的输入序列的一个数据点。
(一行数据中,可能存在某一列为`null`值,但不会全部都是`null`) | void transform(Row row, PointCollector collector) throws Exception | +| SlidingTimeWindowAccessStrategy | 自定义时间序列生成函数,以滑动时间窗口的方式处理原始数据。
框架会为每一个原始数据输入窗口调用一次`transform`方法,输入 k 列时间序列 m 行数据,输出 1 列时间序列 n 行数据。
一个窗口可能存在多行数据,输入序列按时间对齐后,每个窗口作为的输入序列的一个数据点。
(每个窗口可能存在 i 行,每行数据可能存在某一列为`null`值,但不会全部都是`null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| SlidingSizeWindowAccessStrategy | 自定义时间序列生成函数,以固定行数的方式处理原始数据,即每个数据处理窗口都会包含固定行数的数据(最后一个窗口除外)。
框架会为每一个原始数据输入窗口调用一次`transform`方法,输入 k 列时间序列 m 行数据,输出 1 列时间序列 n 行数据。
一个窗口可能存在多行数据,输入序列按时间对齐后,每个窗口作为的输入序列的一个数据点。
(每个窗口可能存在 i 行,每行数据可能存在某一列为`null`值,但不会全部都是`null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| SessionTimeWindowAccessStrategy | 自定义时间序列生成函数,以会话窗口的方式处理原始数据。
框架会为每一个原始数据输入窗口调用一次`transform`方法,输入 k 列时间序列 m 行数据,输出 1 列时间序列 n 行数据。
一个窗口可能存在多行数据,输入序列按时间对齐后,每个窗口作为的输入序列的一个数据点。
(每个窗口可能存在 i 行,每行数据可能存在某一列为`null`值,但不会全部都是`null`) | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | +| StateWindowAccessStrategy | 自定义时间序列生成函数,以状态窗口的方式处理原始数据。
框架会为每一个原始数据输入窗口调用一次`transform`方法,输入 1 列时间序列 m 行数据,输出 1 列时间序列 n 行数据。
一个窗口可能存在多行数据,目前仅支持对一个物理量也就是一列数据进行开窗。 | void transform(RowWindow rowWindow, PointCollector collector) throws Exception | + +#### 接口详情: + +- `RowByRowAccessStrategy`的构造不需要任何参数。 + +- `SlidingTimeWindowAccessStrategy` + +开窗示意图: + + + +`SlidingTimeWindowAccessStrategy`有多种构造方法,您可以向构造方法提供 3 类参数: + +1. 时间轴显示时间窗开始和结束时间 + +时间轴显示时间窗开始和结束时间不是必须要提供的。当您不提供这类参数时,时间轴显示时间窗开始时间会被定义为整个查询结果集中最小的时间戳,时间轴显示时间窗结束时间会被定义为整个查询结果集中最大的时间戳。 + +2. 划分时间轴的时间间隔参数(必须为正数) +3. 滑动步长(不要求大于等于时间间隔,但是必须为正数) + +滑动步长参数也不是必须的。当您不提供滑动步长参数时,滑动步长会被设定为划分时间轴的时间间隔。 + +3 类参数的关系可见下图。策略的构造方法详见 Javadoc。 + + + +> 注意,最后的一些时间窗口的实际时间间隔可能小于规定的时间间隔参数。另外,可能存在某些时间窗口内数据行数量为 0 的情况,这种情况框架也会为该窗口调用一次`transform`方法。 + +- `SlidingSizeWindowAccessStrategy` + +开窗示意图: + + + +`SlidingSizeWindowAccessStrategy`有多种构造方法,您可以向构造方法提供 2 个参数: + +1. 窗口大小,即一个数据处理窗口包含的数据行数。注意,最后一些窗口的数据行数可能少于规定的数据行数。 +2. 滑动步长,即下一窗口第一个数据行与当前窗口第一个数据行间的数据行数(不要求大于等于窗口大小,但是必须为正数) + +滑动步长参数不是必须的。当您不提供滑动步长参数时,滑动步长会被设定为窗口大小。 + +- `SessionTimeWindowAccessStrategy` + +开窗示意图:**时间间隔小于等于给定的最小时间间隔 sessionGap 则分为一组。** + + + + +`SessionTimeWindowAccessStrategy`有多种构造方法,您可以向构造方法提供 2 类参数: + +1. 时间轴显示时间窗开始和结束时间。 +2. 会话窗口之间的最小时间间隔。 + +- `StateWindowAccessStrategy` + +开窗示意图:**对于数值型数据,状态差值小于等于给定的阈值 delta 则分为一组。** + + + +`StateWindowAccessStrategy`有四种构造方法: + +1. 针对数值型数据,可以提供时间轴显示时间窗开始和结束时间以及对于单个窗口内部允许变化的阈值delta。 +2. 针对文本数据以及布尔数据,可以提供时间轴显示时间窗开始和结束时间。对于这两种数据类型,单个窗口内的数据是相同的,不需要提供变化阈值。 +3. 针对数值型数据,可以只提供单个窗口内部允许变化的阈值delta,时间轴显示时间窗开始时间会被定义为整个查询结果集中最小的时间戳,时间轴显示时间窗结束时间会被定义为整个查询结果集中最大的时间戳。 +4. 
针对文本数据以及布尔数据,可以不提供任何参数,开始与结束时间戳见3中解释。 + +StateWindowAccessStrategy 目前只能接收一列输入。策略的构造方法详见 Javadoc。 + + 2.2.2 **setOutputDataType** + +注意,您在此处设定的输出结果序列的类型,决定了`transform`方法中`PointCollector`实际能够接收的数据类型。`setOutputDataType`中设定的输出类型和`PointCollector`实际能够接收的数据输出类型关系如下: + +| `setOutputDataType`中设定的输出类型 | `PointCollector`实际能够接收的输出类型 | +| :---------------------------------- | :----------------------------------------------------------- | +| INT32 | int | +| INT64 | long | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | boolean | +| TEXT | java.lang.String 和 org.apache.iotdb.udf.api.type.Binary | + +UDTF 输出序列的类型是运行时决定的。您可以根据输入序列类型动态决定输出序列类型。 + +示例: + +```java +void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { + // do something + // ... + + configurations + .setAccessStrategy(new RowByRowAccessStrategy()) + .setOutputDataType(parameters.getDataType(0)); +} +``` + +3. **void transform(Row row, PointCollector collector) throws Exception** + +当您在`beforeStart`方法中指定 UDF 读取原始数据的策略为 `RowByRowAccessStrategy`,您就需要实现该方法,在该方法中增加对原始数据处理的逻辑。 + +该方法每次处理原始数据的一行。原始数据由`Row`读入,由`PointCollector`输出。您可以选择在一次`transform`方法调用中输出任意数量的数据点。需要注意的是,输出数据点的类型必须与您在`beforeStart`方法中设置的一致,而输出数据点的时间戳必须是严格单调递增的。 + +下面是一个实现了`void transform(Row row, PointCollector collector) throws Exception`方法的完整 UDF 示例。它是一个加法器,接收两列时间序列输入,当这两个数据点都不为`null`时,输出这两个数据点的代数和。 + +``` java +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Adder implements UDTF { + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT64) + 
.setAccessStrategy(new RowByRowAccessStrategy()); + } + + @Override + public void transform(Row row, PointCollector collector) throws Exception { + if (row.isNull(0) || row.isNull(1)) { + return; + } + collector.putLong(row.getTime(), row.getLong(0) + row.getLong(1)); + } +} +``` + +4. **void transform(RowWindow rowWindow, PointCollector collector) throws Exception** + +当您在`beforeStart`方法中指定 UDF 读取原始数据的策略为 `SlidingTimeWindowAccessStrategy`或者`SlidingSizeWindowAccessStrategy`时,您就需要实现该方法,在该方法中增加对原始数据处理的逻辑。 + +该方法每次处理固定行数或者固定时间间隔内的一批数据,我们称包含这一批数据的容器为窗口。原始数据由`RowWindow`读入,由`PointCollector`输出。`RowWindow`能够帮助您访问某一批次的`Row`,它提供了对这一批次的`Row`进行随机访问和迭代访问的接口。您可以选择在一次`transform`方法调用中输出任意数量的数据点,需要注意的是,输出数据点的类型必须与您在`beforeStart`方法中设置的一致,而输出数据点的时间戳必须是严格单调递增的。 + +下面是一个实现了`void transform(RowWindow rowWindow, PointCollector collector) throws Exception`方法的完整 UDF 示例。它是一个计数器,接收任意列数的时间序列输入,作用是统计并输出指定时间范围内每一个时间窗口中的数据行数。 + +```java +import java.io.IOException; +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.RowWindow; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.SlidingTimeWindowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Counter implements UDTF { + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT32) + .setAccessStrategy(new SlidingTimeWindowAccessStrategy( + parameters.getLong("time_interval"), + parameters.getLong("sliding_step"), + parameters.getLong("display_window_begin"), + parameters.getLong("display_window_end"))); + } + + @Override + public void transform(RowWindow rowWindow, PointCollector collector) throws Exception { + if (rowWindow.windowSize() != 0) { + collector.putInt(rowWindow.windowStartTime(), 
rowWindow.windowSize()); + } + } +} +``` + +5. **void terminate(PointCollector collector) throws Exception** + +在一些场景下,UDF 需要遍历完所有的原始数据后才能得到最后的输出结果。`terminate`接口为这类 UDF 提供了支持。 + +该方法会在所有的`transform`调用执行完成后,在`beforeDestroy`方法执行前被调用。您可以选择使用`transform`方法进行单纯的数据处理,最后使用`terminate`将处理结果输出。 + +结果需要由`PointCollector`输出。您可以选择在一次`terminate`方法调用中输出任意数量的数据点。需要注意的是,输出数据点的类型必须与您在`beforeStart`方法中设置的一致,而输出数据点的时间戳必须是严格单调递增的。 + +下面是一个实现了`void terminate(PointCollector collector) throws Exception`方法的完整 UDF 示例。它接收一个`INT32`类型的时间序列输入,作用是输出该序列的最大值点。 + +```java +import java.io.IOException; +import org.apache.iotdb.udf.api.UDTF; +import org.apache.iotdb.udf.api.access.Row; +import org.apache.iotdb.udf.api.collector.PointCollector; +import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; +import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; +import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; +import org.apache.iotdb.udf.api.type.Type; + +public class Max implements UDTF { + + private Long time; + private int value; + + @Override + public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { + configurations + .setOutputDataType(Type.INT32) + .setAccessStrategy(new RowByRowAccessStrategy()); + } + + @Override + public void transform(Row row, PointCollector collector) { + if (row.isNull(0)) { + return; + } + int candidateValue = row.getInt(0); + if (time == null || value < candidateValue) { + time = row.getTime(); + value = candidateValue; + } + } + + @Override + public void terminate(PointCollector collector) throws IOException { + if (time != null) { + collector.putInt(time, value); + } + } +} +``` + +6. 
**void beforeDestroy()** + +UDTF 的结束方法,您可以在此方法中进行一些资源释放等的操作。 + +此方法由框架调用。对于一个 UDF 类实例而言,生命周期中会且只会被调用一次,即在处理完最后一条记录之后被调用。 + +### UDAF(User Defined Aggregation Function) + +一个完整的 UDAF 定义涉及到 State 和 UDAF 两个类。 + +#### State 类 + +编写一个 State 类需要实现`org.apache.iotdb.udf.api.State`接口,下表是需要实现的方法说明。 + +#### 接口说明: + +| 接口定义 | 描述 | 是否必须 | +| -------------------------------- | ------------------------------------------------------------ | -------- | +| void reset() | 将 `State` 对象重置为初始的状态,您需要像编写构造函数一样,在该方法内填入 `State` 类中各个字段的初始值。 | 是 | +| byte[] serialize() | 将 `State` 序列化为二进制数据。该方法用于 IoTDB 内部的 `State` 对象传递,注意序列化的顺序必须和下面的反序列化方法一致。 | 是 | +| void deserialize(byte[] bytes) | 将二进制数据反序列化为 `State`。该方法用于 IoTDB 内部的 `State` 对象传递,注意反序列化的顺序必须和上面的序列化方法一致。 | 是 | + +#### 接口详细介绍: + +1. **void reset()** + +该方法的作用是将 `State` 重置为初始的状态,您需要在该方法内填写 `State` 对象中各个字段的初始值。出于优化上的考量,IoTDB 在内部会尽可能地复用 `State`,而不是为每一个组创建一个新的 `State`,这样会引入不必要的开销。当 `State` 更新完一个组中的数据之后,就会调用这个方法重置为初始状态,以此来处理下一个组。 + +以求平均数(也就是 `avg`)的 `State` 为例,您需要数据的总和 `sum` 与数据的条数 `count`,并在 `reset()` 方法中将二者初始化为 0。 + +```java +class AvgState implements State { + double sum; + + long count; + + @Override + public void reset() { + sum = 0; + count = 0; + } + + // other methods +} +``` + +2. 
**byte[] serialize()/void deserialize(byte[] bytes)** + +该方法的作用是将 State 序列化为二进制数据,和从二进制数据中反序列化出 State。IoTDB 作为分布式数据库,涉及到在不同节点中传递数据,因此您需要编写这两个方法,来实现 State 在不同节点中的传递。注意序列化和反序列的顺序必须一致。 + +还是以求平均数(也就是求 avg)的 State 为例,您可以通过任意途径将 State 的内容转化为 `byte[]` 数组,以及从 `byte[]` 数组中读取出 State 的内容,下面展示的是用 Java8 引入的 `ByteBuffer` 进行序列化/反序列的代码: + +```java +@Override +public byte[] serialize() { + ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES + Long.BYTES); + buffer.putDouble(sum); + buffer.putLong(count); + + return buffer.array(); +} + +@Override +public void deserialize(byte[] bytes) { + ByteBuffer buffer = ByteBuffer.wrap(bytes); + sum = buffer.getDouble(); + count = buffer.getLong(); +} +``` + +#### UDAF 类 + +编写一个 UDAF 类需要实现`org.apache.iotdb.udf.api.UDAF`接口,下表是需要实现的方法说明。 + +#### 接口说明: + +| 接口定义 | 描述 | 是否必须 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | -------- | +| void validate(UDFParameterValidator validator) throws Exception | 在初始化方法`beforeStart`调用前执行,用于检测`UDFParameters`中用户输入的参数是否合法。该方法与 UDTF 的`validate`相同。 | 否 | +| void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception | 初始化方法,在 UDAF 处理输入数据前,调用用户自定义的初始化行为。与 UDTF 不同的是,这里的 configuration 是 `UDAFConfiguration` 类型。 | 是 | +| State createState() | 创建`State`对象,一般只需要调用默认构造函数,然后按需修改默认的初始值即可。 | 是 | +| void addInput(State state, Column[] columns, BitMap bitMap) | 根据传入的数据`Column[]`批量地更新`State`对象,注意最后一列,也就是 `columns[columns.length - 1]` 总是代表时间列。另外`BitMap`表示之前已经被过滤掉的数据,您在编写该方法时需要手动判断对应的数据是否被过滤掉。 | 是 | +| void combineState(State state, State rhs) | 将`rhs`状态合并至`state`状态中。在分布式场景下,同一组的数据可能分布在不同节点上,IoTDB 会为每个节点上的部分数据生成一个`State`对象,然后调用该方法合并成完整的`State`。 | 是 | +| void outputFinal(State state, ResultValue resultValue) | 根据`State`中的数据,计算出最终的聚合结果。注意根据聚合的语义,每一组只能输出一个值。 | 是 | +| void beforeDestroy() | UDAF 的结束方法。此方法由框架调用,并且只会被调用一次,即在处理完最后一条记录之后被调用。 | 否 | + +在一个完整的 UDAF 实例生命周期中,各个方法的调用顺序如下: + +1. State createState() +2. 
void validate(UDFParameterValidator validator) throws Exception +3. void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception +4. void addInput(State state, Column[] columns, BitMap bitMap) +5. void combineState(State state, State rhs) +6. void outputFinal(State state, ResultValue resultValue) +7. void beforeDestroy() + +和 UDTF 类似,框架每执行一次 UDAF 查询,都会构造一个全新的 UDF 类实例,查询结束时,对应的 UDF 类实例即被销毁,因此不同 UDAF 查询(即使是在同一个 SQL 语句中)UDF 类实例内部的数据都是隔离的。您可以放心地在 UDAF 中维护一些状态数据,无需考虑并发对 UDF 类实例内部状态数据的影响。 + +#### 接口详细介绍: + +1. **void validate(UDFParameterValidator validator) throws Exception** + +同 UDTF, `validate`方法能够对用户输入的参数进行验证。 + +您可以在该方法中限制输入序列的数量和类型,检查用户输入的属性或者进行自定义逻辑的验证。 + +2. **void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception** + + `beforeStart`方法的作用与 UDTF 相同: + + 1. 帮助用户解析 SQL 语句中的 UDF 参数 + 2. 配置 UDF 运行时必要的信息,即指定 UDF 访问原始数据时采取的策略和输出结果序列的类型 + 3. 创建资源,比如建立外部链接,打开文件等。 + +其中,`UDFParameters` 类型的作用可以参照上文。 + +2.2 **UDAFConfigurations** + +和 UDTF 的区别在于,UDAF 使用了 `UDAFConfigurations` 作为 `configuration` 对象的类型。 + +目前,该类仅支持设置输出数据的类型。 + +```java +void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { + // parameters + // ... + + // configurations + configurations + .setOutputDataType(Type.INT32); +} +``` + +`setOutputDataType` 中设定的输出类型和 `ResultValue` 实际能够接收的数据输出类型关系如下: + +| `setOutputDataType`中设定的输出类型 | `ResultValue`实际能够接收的输出类型 | + | :---------------------------------- | :------------------------------------- | +| INT32 | int | +| INT64 | long | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | boolean | +| TEXT | org.apache.iotdb.udf.api.type.Binary | + +UDAF 输出序列的类型也是运行时决定的。您可以根据输入序列类型动态决定输出序列类型。 + +示例: + +```java +void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { + // do something + // ... + + configurations + .setOutputDataType(parameters.getDataType(0)); +} +``` + +3. 
**State createState()** + +为 UDAF 创建并初始化 `State`。由于 Java 语言本身的限制,您只能调用 `State` 类的默认构造函数。默认构造函数会为类中所有的字段赋一个默认的初始值,如果该初始值并不符合您的要求,您需要在这个方法内进行手动的初始化。 + +下面是一个包含手动初始化的例子。假设您要实现一个累乘的聚合函数,`State` 的初始值应该设置为 1,但是默认构造函数会初始化为 0,因此您需要在调用默认构造函数之后,手动对 `State` 进行初始化: + +```java +public State createState() { + MultiplyState state = new MultiplyState(); + state.result = 1; + return state; +} +``` + +4. **void addInput(State state, Column[] columns, BitMap bitMap)** + +该方法的作用是,通过原始的输入数据来更新 `State` 对象。出于性能上的考量,也是为了和 IoTDB 向量化的查询引擎相对齐,原始的输入数据不再是一个数据点,而是列的数组 `Column[]`。注意最后一列(也就是 `columns[columns.length - 1]` )总是时间列,因此您也可以在 UDAF 中根据时间进行不同的操作。 + +由于输入参数的类型不是一个数据点,而是多个列,您需要手动对列中的部分数据进行过滤处理,这就是第三个参数 `BitMap` 存在的意义。它用来标识这些列中哪些数据被过滤掉了,您在任何情况下都不应该将被过滤掉的数据纳入计算。 + +下面是一个用于统计数据条数(也就是 count)的 `addInput()` 示例。它展示了您应该如何使用 `BitMap` 来忽视那些已经被过滤掉的数据。注意还是由于 Java 语言本身的限制,您需要在方法的开头将接口中定义的 `State` 类型强制转化为自定义的 `State` 类型,不然后续无法正常使用该 `State` 对象。 + +```java +public void addInput(State state, Column[] columns, BitMap bitMap) { + CountState countState = (CountState) state; + + int count = columns[0].getPositionCount(); + for (int i = 0; i < count; i++) { + if (bitMap != null && !bitMap.isMarked(i)) { + continue; + } + if (!columns[0].isNull(i)) { + countState.count++; + } + } +} +``` + +5. **void combineState(State state, State rhs)** + +该方法的作用是合并两个 `State`,更加准确的说,是用第二个 `State` 对象来更新第一个 `State` 对象。IoTDB 是分布式数据库,同一组的数据可能分布在多个不同的节点上。出于性能考虑,IoTDB 会为每个节点上的部分数据先进行聚合成 `State`,然后再将不同节点上的、属于同一个组的 `State` 进行合并,这就是 `combineState` 的作用。 + +下面是一个用于求平均数(也就是 avg)的 `combineState()` 示例。和 `addInput` 类似,您都需要在开头对两个 `State` 进行强制类型转换。另外需要注意是用第二个 `State` 的内容来更新第一个 `State` 的值。 + +```java +public void combineState(State state, State rhs) { + AvgState avgState = (AvgState) state; + AvgState avgRhs = (AvgState) rhs; + + avgState.count += avgRhs.count; + avgState.sum += avgRhs.sum; +} +``` + +6. 
**void outputFinal(State state, ResultValue resultValue)** + +该方法的作用是从 `State` 中计算出最终的结果。您需要访问 `State` 中的各个字段,求出最终的结果,并将最终的结果设置到 `ResultValue` 对象中。IoTDB 内部会为每个组在最后调用一次这个方法。注意根据聚合的语义,最终的结果只能是一个值。 + +下面还是一个用于求平均数(也就是 avg)的 `outputFinal` 示例。除了开头的强制类型转换之外,您还将看到 `ResultValue` 对象的具体用法,即通过 `setXXX`(其中 `XXX` 是类型名)来设置最后的结果。 + +```java +public void outputFinal(State state, ResultValue resultValue) { + AvgState avgState = (AvgState) state; + + if (avgState.count != 0) { + resultValue.setDouble(avgState.sum / avgState.count); + } else { + resultValue.setNull(); + } +} +``` + +7. **void beforeDestroy()** + +UDAF 的结束方法,您可以在此方法中进行一些资源释放等的操作。 + +此方法由框架调用。对于一个 UDF 类实例而言,生命周期中会且只会被调用一次,即在处理完最后一条记录之后被调用。 + +### 完整 Maven 项目示例 + +如果您使用 [Maven](http://search.maven.org/),可以参考我们编写的示例项目**udf-example**。您可以在 [这里](https://github.com/apache/iotdb/tree/master/example/udf) 找到它。 + + +## 为iotdb贡献通用的内置UDF函数 + +该部分主要讲述了外部用户如何将自己编写的 UDF 贡献给 IoTDB 社区。 + +#### 前提条件 + +1. UDF 具有通用性。 + + 通用性主要指的是:UDF 在某些业务场景下,可以被广泛使用。换言之,就是 UDF 具有复用价值,可被社区内其他用户直接使用。 + + 如果不确定自己写的 UDF 是否具有通用性,可以发邮件到 `dev@iotdb.apache.org` 或直接创建 ISSUE 发起讨论。 + +2. UDF 已经完成测试,且能够正常运行在用户的生产环境中。 + +#### 贡献清单 + +1. UDF 的源代码 +2. UDF 的测试用例 +3. UDF 的使用说明 + +#### 源代码 + +1. 在`iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin`中创建 UDF 主类和相关的辅助类。 +2. 
在`iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`中注册编写的 UDF。 + +#### 测试用例 + +至少需要为贡献的 UDF 编写集成测试。 + +可以在`integration-test/src/test/java/org/apache/iotdb/db/it/udf`中为贡献的 UDF 新增一个测试类进行测试。 + +#### 使用说明 + +使用说明需要包含:UDF 的名称、UDF 的作用、执行函数必须的属性参数、函数的适用的场景以及使用示例等。 + +使用说明需包含中英文两个版本。应分别在 `docs/zh/UserGuide/Operation Manual/DML Data Manipulation Language.md` 和 `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md` 中新增使用说明。 + +#### 提交 PR + +当准备好源代码、测试用例和使用说明后,就可以将 UDF 贡献到 IoTDB 社区了。在 [Github](https://github.com/apache/iotdb) 上面提交 Pull Request (PR) 即可。具体提交方式见:[贡献指南](https://iotdb.apache.org/zh/Community/Development-Guide.html)。 + +当 PR 评审通过并被合并后, UDF 就已经贡献给 IoTDB 社区了! \ No newline at end of file diff --git a/src/zh/UserGuide/latest/User-Manual/Database-Programming.md b/src/zh/UserGuide/latest/User-Manual/Database-Programming.md index 385319086..86b0c31a8 100644 --- a/src/zh/UserGuide/latest/User-Manual/Database-Programming.md +++ b/src/zh/UserGuide/latest/User-Manual/Database-Programming.md @@ -1030,812 +1030,3 @@ SELECT avg(count_s1) from root.sg_count.d; | :---------------------------------- |----------------------|----------|---------------| | `continuous_query_submit_thread` | 用于周期性提交连续查询执行任务的线程数 | int32 | 2 | | `continuous_query_min_every_interval_in_ms` | 系统允许的连续查询最小的周期性时间间隔 | duration | 1000 | - -## 用户自定义函数 - -UDF(User Defined Function)即用户自定义函数。IoTDB 提供多种内建函数来满足您的计算需求,同时您还可以通过创建自定义函数来满足更多的计算需求。 - -根据此文档,您将会很快学会 UDF 的编写、注册、使用等操作。 - -### UDF 类型 - -IoTDB 支持两种类型的 UDF 函数,如下表所示。 - -| UDF 分类 | 描述 | -| --------------------------------------------------- | ------------------------------------------------------------ | -| UDTF(User Defined Timeseries Generating Function) | 自定义时间序列生成函数。该类函数允许接收多条时间序列,最终会输出一条时间序列,生成的时间序列可以有任意多数量的数据点。 | -| UDAF(User Defined Aggregation Function) | 自定义聚合函数。该类函数接受一条时间序列数据,最终会根据用户指定的 GROUP BY 类型,为每个组生成一个聚合后的数据点。 | - -### UDF 依赖 - -如果您使用 [Maven](http://search.maven.org/) 
,可以从 [Maven 库](http://search.maven.org/) 中搜索下面示例中的依赖。请注意选择和目标 IoTDB 服务器版本相同的依赖版本。 - -``` xml - - org.apache.iotdb - udf-api - 1.0.0 - provided - -``` - -### UDTF(User Defined Timeseries Generating Function) - -编写一个 UDTF 需要继承`org.apache.iotdb.udf.api.UDTF`类,并至少实现`beforeStart`方法和一种`transform`方法。 - -下表是所有可供用户实现的接口说明。 - -| 接口定义 | 描述 | 是否必须 | -| :----------------------------------------------------------- | :----------------------------------------------------------- | ------------------ | -| `void validate(UDFParameterValidator validator) throws Exception` | 在初始化方法`beforeStart`调用前执行,用于检测`UDFParameters`中用户输入的参数是否合法。 | 否 | -| `void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception` | 初始化方法,在 UDTF 处理输入数据前,调用用户自定义的初始化行为。用户每执行一次 UDTF 查询,框架就会构造一个新的 UDF 类实例,该方法在每个 UDF 类实例被初始化时调用一次。在每一个 UDF 类实例的生命周期内,该方法只会被调用一次。 | 是 | -| `void transform(Row row, PointCollector collector) throws Exception` | 这个方法由框架调用。当您在`beforeStart`中选择以`RowByRowAccessStrategy`的策略消费原始数据时,这个数据处理方法就会被调用。输入参数以`Row`的形式传入,输出结果通过`PointCollector`输出。您需要在该方法内自行调用`collector`提供的数据收集方法,以决定最终的输出数据。 | 与下面的方法二选一 | -| `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | 这个方法由框架调用。当您在`beforeStart`中选择以`SlidingSizeWindowAccessStrategy`或者`SlidingTimeWindowAccessStrategy`的策略消费原始数据时,这个数据处理方法就会被调用。输入参数以`RowWindow`的形式传入,输出结果通过`PointCollector`输出。您需要在该方法内自行调用`collector`提供的数据收集方法,以决定最终的输出数据。 | 与上面的方法二选一 | -| `void terminate(PointCollector collector) throws Exception` | 这个方法由框架调用。该方法会在所有的`transform`调用执行完成后,在`beforeDestory`方法执行前被调用。在一个 UDF 查询过程中,该方法会且只会调用一次。您需要在该方法内自行调用`collector`提供的数据收集方法,以决定最终的输出数据。 | 否 | -| `void beforeDestroy() ` | UDTF 的结束方法。此方法由框架调用,并且只会被调用一次,即在处理完最后一条记录之后被调用。 | 否 | - -在一个完整的 UDTF 实例生命周期中,各个方法的调用顺序如下: - -1. `void validate(UDFParameterValidator validator) throws Exception` -2. `void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception` -3. 
`void transform(Row row, PointCollector collector) throws Exception`或者`void transform(RowWindow rowWindow, PointCollector collector) throws Exception` -4. `void terminate(PointCollector collector) throws Exception` -5. `void beforeDestroy() ` - -注意,框架每执行一次 UDTF 查询,都会构造一个全新的 UDF 类实例,查询结束时,对应的 UDF 类实例即被销毁,因此不同 UDTF 查询(即使是在同一个 SQL 语句中)UDF 类实例内部的数据都是隔离的。您可以放心地在 UDTF 中维护一些状态数据,无需考虑并发对 UDF 类实例内部状态数据的影响。 - -下面将详细介绍各个接口的使用方法。 - - * void validate(UDFParameterValidator validator) throws Exception - - `validate`方法能够对用户输入的参数进行验证。 - - 您可以在该方法中限制输入序列的数量和类型,检查用户输入的属性或者进行自定义逻辑的验证。 - - `UDFParameterValidator`的使用方法请见 Javadoc。 - - * void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception - - `beforeStart`方法有两个作用: - - 1. 帮助用户解析 SQL 语句中的 UDF 参数 - 2. 配置 UDF 运行时必要的信息,即指定 UDF 访问原始数据时采取的策略和输出结果序列的类型 - 3. 创建资源,比如建立外部链接,打开文件等。 - -#### UDFParameters - -`UDFParameters`的作用是解析 SQL 语句中的 UDF 参数(SQL 中 UDF 函数名称后括号中的部分)。参数包括序列类型参数和字符串 key-value 对形式输入的属性参数。 - -例子: - -``` sql -SELECT UDF(s1, s2, 'key1'='iotdb', 'key2'='123.45') FROM root.sg.d; -``` - -用法: - -``` java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - String stringValue = parameters.getString("key1"); // iotdb - Float floatValue = parameters.getFloat("key2"); // 123.45 - Double doubleValue = parameters.getDouble("key3"); // null - int intValue = parameters.getIntOrDefault("key4", 678); // 678 - // do something - - // configurations - // ... -} -``` - -#### UDTFConfigurations - -您必须使用 `UDTFConfigurations` 指定 UDF 访问原始数据时采取的策略和输出结果序列的类型。 - -用法: - -``` java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - // parameters - // ... 
- - // configurations - configurations - .setAccessStrategy(new RowByRowAccessStrategy()) - .setOutputDataType(Type.INT32); -} -``` - -其中`setAccessStrategy`方法用于设定 UDF 访问原始数据时采取的策略,`setOutputDataType`用于设定输出结果序列的类型。 - - * setAccessStrategy - -注意,您在此处设定的原始数据访问策略决定了框架会调用哪一种`transform`方法 ,请实现与原始数据访问策略对应的`transform`方法。当然,您也可以根据`UDFParameters`解析出来的属性参数,动态决定设定哪一种策略,因此,实现两种`transform`方法也是被允许的。 - -下面是您可以设定的访问原始数据的策略: - -| 接口定义 | 描述 | 调用的`transform`方法 | -| :-------------------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ | -| `RowByRowAccessStrategy` | 逐行地处理原始数据。框架会为每一行原始数据输入调用一次`transform`方法。当 UDF 只有一个输入序列时,一行输入就是该输入序列中的一个数据点。当 UDF 有多个输入序列时,一行输入序列对应的是这些输入序列按时间对齐后的结果(一行数据中,可能存在某一列为`null`值,但不会全部都是`null`)。 | `void transform(Row row, PointCollector collector) throws Exception` | -| `SlidingTimeWindowAccessStrategy` | 以滑动时间窗口的方式处理原始数据。框架会为每一个原始数据输入窗口调用一次`transform`方法。一个窗口可能存在多行数据,每一行数据对应的是输入序列按时间对齐后的结果(一行数据中,可能存在某一列为`null`值,但不会全部都是`null`)。 | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `SlidingSizeWindowAccessStrategy` | 以固定行数的方式处理原始数据,即每个数据处理窗口都会包含固定行数的数据(最后一个窗口除外)。框架会为每一个原始数据输入窗口调用一次`transform`方法。一个窗口可能存在多行数据,每一行数据对应的是输入序列按时间对齐后的结果(一行数据中,可能存在某一列为`null`值,但不会全部都是`null`)。 | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `SessionTimeWindowAccessStrategy` | 以会话窗口的方式处理原始数据,框架会为每一个原始数据输入窗口调用一次`transform`方法。一个窗口可能存在多行数据,每一行数据对应的是输入序列按时间对齐后的结果(一行数据中,可能存在某一列为`null`值,但不会全部都是`null`)。 | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | -| `StateWindowAccessStrategy` | 以状态窗口的方式处理原始数据,框架会为每一个原始数据输入窗口调用一次`transform`方法。一个窗口可能存在多行数据。目前仅支持对一个物理量也就是一列数据进行开窗。 | `void transform(RowWindow rowWindow, PointCollector collector) throws Exception` | - -`RowByRowAccessStrategy`的构造不需要任何参数。 - -如图是`SlidingTimeWindowAccessStrategy`的开窗示意图。 - - -`SlidingTimeWindowAccessStrategy`有多种构造方法,您可以向构造方法提供 3 类参数: - -1. 
时间轴显示时间窗开始和结束时间 -2. 划分时间轴的时间间隔参数(必须为正数) -3. 滑动步长(不要求大于等于时间间隔,但是必须为正数) - -时间轴显示时间窗开始和结束时间不是必须要提供的。当您不提供这类参数时,时间轴显示时间窗开始时间会被定义为整个查询结果集中最小的时间戳,时间轴显示时间窗结束时间会被定义为整个查询结果集中最大的时间戳。 - -滑动步长参数也不是必须的。当您不提供滑动步长参数时,滑动步长会被设定为划分时间轴的时间间隔。 - -3 类参数的关系可见下图。策略的构造方法详见 Javadoc。 - - - -注意,最后的一些时间窗口的实际时间间隔可能小于规定的时间间隔参数。另外,可能存在某些时间窗口内数据行数量为 0 的情况,这种情况框架也会为该窗口调用一次`transform`方法。 - -如图是`SlidingSizeWindowAccessStrategy`的开窗示意图。 - - -`SlidingSizeWindowAccessStrategy`有多种构造方法,您可以向构造方法提供 2 个参数: - -1. 窗口大小,即一个数据处理窗口包含的数据行数。注意,最后一些窗口的数据行数可能少于规定的数据行数。 -2. 滑动步长,即下一窗口第一个数据行与当前窗口第一个数据行间的数据行数(不要求大于等于窗口大小,但是必须为正数) - -滑动步长参数不是必须的。当您不提供滑动步长参数时,滑动步长会被设定为窗口大小。 - -如图是`SessionTimeWindowAccessStrategy`的开窗示意图。**时间间隔小于等于给定的最小时间间隔 sessionGap 则分为一组。** - - -`SessionTimeWindowAccessStrategy`有多种构造方法,您可以向构造方法提供 2 类参数: - -1. 时间轴显示时间窗开始和结束时间。 -2. 会话窗口之间的最小时间间隔。 - -如图是`StateWindowAccessStrategy`的开窗示意图。**对于数值型数据,状态差值小于等于给定的阈值 delta 则分为一组。** - - -`StateWindowAccessStrategy`有四种构造方法。 - -1. 针对数值型数据,可以提供时间轴显示时间窗开始和结束时间以及对于单个窗口内部允许变化的阈值delta。 -2. 针对文本数据以及布尔数据,可以提供时间轴显示时间窗开始和结束时间。对于这两种数据类型,单个窗口内的数据是相同的,不需要提供变化阈值。 -3. 针对数值型数据,可以只提供单个窗口内部允许变化的阈值delta,时间轴显示时间窗开始时间会被定义为整个查询结果集中最小的时间戳,时间轴显示时间窗结束时间会被定义为整个查询结果集中最大的时间戳。 -4. 针对文本数据以及布尔数据,可以不提供任何参数,开始与结束时间戳见3中解释。 - -StateWindowAccessStrategy 目前只能接收一列输入。策略的构造方法详见 Javadoc。 - - * setOutputDataType - -注意,您在此处设定的输出结果序列的类型,决定了`transform`方法中`PointCollector`实际能够接收的数据类型。`setOutputDataType`中设定的输出类型和`PointCollector`实际能够接收的数据输出类型关系如下: - -| `setOutputDataType`中设定的输出类型 | `PointCollector`实际能够接收的输出类型 | -| :---------------------------------- | :----------------------------------------------------------- | -| `INT32` | `int` | -| `INT64` | `long` | -| `FLOAT` | `float` | -| `DOUBLE` | `double` | -| `BOOLEAN` | `boolean` | -| `TEXT` | `java.lang.String` 和 `org.apache.iotdb.udf.api.type.Binary` | - -UDTF 输出序列的类型是运行时决定的。您可以根据输入序列类型动态决定输出序列类型。 - -下面是一个简单的例子: - -```java -void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) throws Exception { - // do something - // ... 
- - configurations - .setAccessStrategy(new RowByRowAccessStrategy()) - .setOutputDataType(parameters.getDataType(0)); -} -``` - -* void transform(Row row, PointCollector collector) throws Exception - -当您在`beforeStart`方法中指定 UDF 读取原始数据的策略为 `RowByRowAccessStrategy`,您就需要实现该方法,在该方法中增加对原始数据处理的逻辑。 - -该方法每次处理原始数据的一行。原始数据由`Row`读入,由`PointCollector`输出。您可以选择在一次`transform`方法调用中输出任意数量的数据点。需要注意的是,输出数据点的类型必须与您在`beforeStart`方法中设置的一致,而输出数据点的时间戳必须是严格单调递增的。 - -下面是一个实现了`void transform(Row row, PointCollector collector) throws Exception`方法的完整 UDF 示例。它是一个加法器,接收两列时间序列输入,当这两个数据点都不为`null`时,输出这两个数据点的代数和。 - -``` java -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Adder implements UDTF { - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(Type.INT64) - .setAccessStrategy(new RowByRowAccessStrategy()); - } - - @Override - public void transform(Row row, PointCollector collector) throws Exception { - if (row.isNull(0) || row.isNull(1)) { - return; - } - collector.putLong(row.getTime(), row.getLong(0) + row.getLong(1)); - } -} -``` - - * void transform(RowWindow rowWindow, PointCollector collector) throws Exception - -当您在`beforeStart`方法中指定 UDF 读取原始数据的策略为 `SlidingTimeWindowAccessStrategy`或者`SlidingSizeWindowAccessStrategy`时,您就需要实现该方法,在该方法中增加对原始数据处理的逻辑。 - -该方法每次处理固定行数或者固定时间间隔内的一批数据,我们称包含这一批数据的容器为窗口。原始数据由`RowWindow`读入,由`PointCollector`输出。`RowWindow`能够帮助您访问某一批次的`Row`,它提供了对这一批次的`Row`进行随机访问和迭代访问的接口。您可以选择在一次`transform`方法调用中输出任意数量的数据点,需要注意的是,输出数据点的类型必须与您在`beforeStart`方法中设置的一致,而输出数据点的时间戳必须是严格单调递增的。 - -下面是一个实现了`void transform(RowWindow rowWindow, 
PointCollector collector) throws Exception`方法的完整 UDF 示例。它是一个计数器,接收任意列数的时间序列输入,作用是统计并输出指定时间范围内每一个时间窗口中的数据行数。 - -```java -import java.io.IOException; -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.RowWindow; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.SlidingTimeWindowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Counter implements UDTF { - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(Type.INT32) - .setAccessStrategy(new SlidingTimeWindowAccessStrategy( - parameters.getLong("time_interval"), - parameters.getLong("sliding_step"), - parameters.getLong("display_window_begin"), - parameters.getLong("display_window_end"))); - } - - @Override - public void transform(RowWindow rowWindow, PointCollector collector) throws Exception { - if (rowWindow.windowSize() != 0) { - collector.putInt(rowWindow.windowStartTime(), rowWindow.windowSize()); - } - } -} -``` - - * void terminate(PointCollector collector) throws Exception - -在一些场景下,UDF 需要遍历完所有的原始数据后才能得到最后的输出结果。`terminate`接口为这类 UDF 提供了支持。 - -该方法会在所有的`transform`调用执行完成后,在`beforeDestory`方法执行前被调用。您可以选择使用`transform`方法进行单纯的数据处理,最后使用`terminate`将处理结果输出。 - -结果需要由`PointCollector`输出。您可以选择在一次`terminate`方法调用中输出任意数量的数据点。需要注意的是,输出数据点的类型必须与您在`beforeStart`方法中设置的一致,而输出数据点的时间戳必须是严格单调递增的。 - -下面是一个实现了`void terminate(PointCollector collector) throws Exception`方法的完整 UDF 示例。它接收一个`INT32`类型的时间序列输入,作用是输出该序列的最大值点。 - -```java -import java.io.IOException; -import org.apache.iotdb.udf.api.UDTF; -import org.apache.iotdb.udf.api.access.Row; -import org.apache.iotdb.udf.api.collector.PointCollector; -import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations; -import 
org.apache.iotdb.udf.api.customizer.parameter.UDFParameters; -import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy; -import org.apache.iotdb.udf.api.type.Type; - -public class Max implements UDTF { - - private Long time; - private int value; - - @Override - public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations) { - configurations - .setOutputDataType(TSDataType.INT32) - .setAccessStrategy(new RowByRowAccessStrategy()); - } - - @Override - public void transform(Row row, PointCollector collector) { - if (row.isNull(0)) { - return; - } - int candidateValue = row.getInt(0); - if (time == null || value < candidateValue) { - time = row.getTime(); - value = candidateValue; - } - } - - @Override - public void terminate(PointCollector collector) throws IOException { - if (time != null) { - collector.putInt(time, value); - } - } -} -``` - - * void beforeDestroy() - -UDTF 的结束方法,您可以在此方法中进行一些资源释放等的操作。 - -此方法由框架调用。对于一个 UDF 类实例而言,生命周期中会且只会被调用一次,即在处理完最后一条记录之后被调用。 - -### UDAF(User Defined Aggregation Function) - -一个完整的 UDAF 定义涉及到 State 和 UDAF 两个类。 - -#### State 类 - -编写一个 State 类需要实现`org.apache.iotdb.udf.api.State`接口,下表是需要实现的方法说明。 - -| 接口定义 | 描述 | 是否必须 | -| -------------------------------- | ------------------------------------------------------------ | -------- | -| `void reset()` | 将 `State` 对象重置为初始的状态,您需要像编写构造函数一样,在该方法内填入 `State` 类中各个字段的初始值。 | 是 | -| `byte[] serialize()` | 将 `State` 序列化为二进制数据。该方法用于 IoTDB 内部的 `State` 对象传递,注意序列化的顺序必须和下面的反序列化方法一致。 | 是 | -| `void deserialize(byte[] bytes)` | 将二进制数据反序列化为 `State`。该方法用于 IoTDB 内部的 `State` 对象传递,注意反序列化的顺序必须和上面的序列化方法一致。 | 是 | - -下面将详细介绍各个接口的使用方法。 - -- void reset() - -该方法的作用是将 `State` 重置为初始的状态,您需要在该方法内填写 `State` 对象中各个字段的初始值。出于优化上的考量,IoTDB 在内部会尽可能地复用 `State`,而不是为每一个组创建一个新的 `State`,这样会引入不必要的开销。当 `State` 更新完一个组中的数据之后,就会调用这个方法重置为初始状态,以此来处理下一个组。 - -以求平均数(也就是 `avg`)的 `State` 为例,您需要数据的总和 `sum` 与数据的条数 `count`,并在 `reset()` 方法中将二者初始化为 0。 - -```java -class AvgState implements State { - double sum; - - 
long count; - - @Override - public void reset() { - sum = 0; - count = 0; - } - - // other methods -} -``` - -- byte[] serialize()/void deserialize(byte[] bytes) - -该方法的作用是将 State 序列化为二进制数据,和从二进制数据中反序列化出 State。IoTDB 作为分布式数据库,涉及到在不同节点中传递数据,因此您需要编写这两个方法,来实现 State 在不同节点中的传递。注意序列化和反序列的顺序必须一致。 - -还是以求平均数(也就是求 avg)的 State 为例,您可以通过任意途径将 State 的内容转化为 `byte[]` 数组,以及从 `byte[]` 数组中读取出 State 的内容,下面展示的是用 Java8 引入的 `ByteBuffer` 进行序列化/反序列的代码: - -```java -@Override -public byte[] serialize() { - ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES + Long.BYTES); - buffer.putDouble(sum); - buffer.putLong(count); - - return buffer.array(); -} - -@Override -public void deserialize(byte[] bytes) { - ByteBuffer buffer = ByteBuffer.wrap(bytes); - sum = buffer.getDouble(); - count = buffer.getLong(); -} -``` - -#### UDAF 类 - -编写一个 UDAF 类需要实现`org.apache.iotdb.udf.api.UDAF`接口,下表是需要实现的方法说明。 - -| 接口定义 | 描述 | 是否必须 | -| ------------------------------------------------------------ | ------------------------------------------------------------ | -------- | -| `void validate(UDFParameterValidator validator) throws Exception` | 在初始化方法`beforeStart`调用前执行,用于检测`UDFParameters`中用户输入的参数是否合法。该方法与 UDTF 的`validate`相同。 | 否 | -| `void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception` | 初始化方法,在 UDAF 处理输入数据前,调用用户自定义的初始化行为。与 UDTF 不同的是,这里的 configuration 是 `UDAFConfiguration` 类型。 | 是 | -| `State createState()` | 创建`State`对象,一般只需要调用默认构造函数,然后按需修改默认的初始值即可。 | 是 | -| `void addInput(State state, Column[] columns, BitMap bitMap)` | 根据传入的数据`Column[]`批量地更新`State`对象,注意最后一列,也就是 `columns[columns.length - 1]` 总是代表时间列。另外`BitMap`表示之前已经被过滤掉的数据,您在编写该方法时需要手动判断对应的数据是否被过滤掉。 | 是 | -| `void combineState(State state, State rhs)` | 将`rhs`状态合并至`state`状态中。在分布式场景下,同一组的数据可能分布在不同节点上,IoTDB 会为每个节点上的部分数据生成一个`State`对象,然后调用该方法合并成完整的`State`。 | 是 | -| `void outputFinal(State state, ResultValue resultValue)` | 根据`State`中的数据,计算出最终的聚合结果。注意根据聚合的语义,每一组只能输出一个值。 | 是 | -| `void beforeDestroy() ` | UDAF 
的结束方法。此方法由框架调用,并且只会被调用一次,即在处理完最后一条记录之后被调用。 | 否 | - -在一个完整的 UDAF 实例生命周期中,各个方法的调用顺序如下: - -1. `State createState()` -2. `void validate(UDFParameterValidator validator) throws Exception` -3. `void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception` -4. `void addInput(State state, Column[] columns, BitMap bitMap)` -5. `void combineState(State state, State rhs)` -6. `void outputFinal(State state, ResultValue resultValue)` -7. `void beforeDestroy()` - -和 UDTF 类似,框架每执行一次 UDAF 查询,都会构造一个全新的 UDF 类实例,查询结束时,对应的 UDF 类实例即被销毁,因此不同 UDAF 查询(即使是在同一个 SQL 语句中)UDF 类实例内部的数据都是隔离的。您可以放心地在 UDAF 中维护一些状态数据,无需考虑并发对 UDF 类实例内部状态数据的影响。 - -下面将详细介绍各个接口的使用方法。 - - * void validate(UDFParameterValidator validator) throws Exception - -同 UDTF, `validate`方法能够对用户输入的参数进行验证。 - -您可以在该方法中限制输入序列的数量和类型,检查用户输入的属性或者进行自定义逻辑的验证。 - - * void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception - - `beforeStart`方法的作用 UDAF 相同: - - 1. 帮助用户解析 SQL 语句中的 UDF 参数 - 2. 配置 UDF 运行时必要的信息,即指定 UDF 访问原始数据时采取的策略和输出结果序列的类型 - 3. 创建资源,比如建立外部链接,打开文件等。 - -其中,`UDFParameters` 类型的作用可以参照上文。 - -##### UDAFConfigurations - -和 UDTF 的区别在于,UDAF 使用了 `UDAFConfigurations` 作为 `configuration` 对象的类型。 - -目前,该类仅支持设置输出数据的类型。 - -```java -void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { - // parameters - // ... 
- - // configurations - configurations - .setOutputDataType(Type.INT32); -} -``` - -`setOutputDataType` 中设定的输出类型和 `ResultValue` 实际能够接收的数据输出类型关系如下: - -| `setOutputDataType`中设定的输出类型 | `ResultValue`实际能够接收的输出类型 | -| :---------------------------------- | :------------------------------------- | -| `INT32` | `int` | -| `INT64` | `long` | -| `FLOAT` | `float` | -| `DOUBLE` | `double` | -| `BOOLEAN` | `boolean` | -| `TEXT` | `org.apache.iotdb.udf.api.type.Binary` | - -UDAF 输出序列的类型也是运行时决定的。您可以根据输入序列类型动态决定输出序列类型。 - -下面是一个简单的例子: - -```java -void beforeStart(UDFParameters parameters, UDAFConfigurations configurations) throws Exception { - // do something - // ... - - configurations - .setOutputDataType(parameters.getDataType(0)); -} -``` - -- State createState() - -为 UDAF 创建并初始化 `State`。由于 Java 语言本身的限制,您只能调用 `State` 类的默认构造函数。默认构造函数会为类中所有的字段赋一个默认的初始值,如果该初始值并不符合您的要求,您需要在这个方法内进行手动的初始化。 - -下面是一个包含手动初始化的例子。假设您要实现一个累乘的聚合函数,`State` 的初始值应该设置为 1,但是默认构造函数会初始化为 0,因此您需要在调用默认构造函数之后,手动对 `State` 进行初始化: - -```java -public State createState() { - MultiplyState state = new MultiplyState(); - state.result = 1; - return state; -} -``` - -- void addInput(State state, Column[] columns, BitMap bitMap) - -该方法的作用是,通过原始的输入数据来更新 `State` 对象。出于性能上的考量,也是为了和 IoTDB 向量化的查询引擎相对齐,原始的输入数据不再是一个数据点,而是列的数组 `Column[]`。注意最后一列(也就是 `columns[columns.length - 1]` )总是时间列,因此您也可以在 UDAF 中根据时间进行不同的操作。 - -由于输入参数的类型不是一个数据点,而是多个列,您需要手动对列中的部分数据进行过滤处理,这就是第三个参数 `BitMap` 存在的意义。它用来标识这些列中哪些数据被过滤掉了,您在任何情况下都无需考虑被过滤掉的数据。 - -下面是一个用于统计数据条数(也就是 count)的 `addInput()` 示例。它展示了您应该如何使用 `BitMap` 来忽视那些已经被过滤掉的数据。注意还是由于 Java 语言本身的限制,您需要在方法的开头将接口中定义的 `State` 类型强制转化为自定义的 `State` 类型,不然后续无法正常使用该 `State` 对象。 - -```java -public void addInput(State state, Column[] columns, BitMap bitMap) { - CountState countState = (CountState) state; - - int count = columns[0].getPositionCount(); - for (int i = 0; i < count; i++) { - if (bitMap != null && !bitMap.isMarked(i)) { - continue; - } - if (!columns[0].isNull(i)) { - countState.count++; - } - } -} -``` - -- void 
combineState(State state, State rhs) - -该方法的作用是合并两个 `State`,更加准确的说,是用第二个 `State` 对象来更新第一个 `State` 对象。IoTDB 是分布式数据库,同一组的数据可能分布在多个不同的节点上。出于性能考虑,IoTDB 会为每个节点上的部分数据先进行聚合成 `State`,然后再将不同节点上的、属于同一个组的 `State` 进行合并,这就是 `combineState` 的作用。 - -下面是一个用于求平均数(也就是 avg)的 `combineState()` 示例。和 `addInput` 类似,您都需要在开头对两个 `State` 进行强制类型转换。另外需要注意是用第二个 `State` 的内容来更新第一个 `State` 的值。 - -```java -public void combineState(State state, State rhs) { - AvgState avgState = (AvgState) state; - AvgState avgRhs = (AvgState) rhs; - - avgState.count += avgRhs.count; - avgState.sum += avgRhs.sum; -} -``` - -- void outputFinal(State state, ResultValue resultValue) - -该方法的作用是从 `State` 中计算出最终的结果。您需要访问 `State` 中的各个字段,求出最终的结果,并将最终的结果设置到 `ResultValue` 对象中。IoTDB 内部会为每个组在最后调用一次这个方法。注意根据聚合的语义,最终的结果只能是一个值。 - -下面还是一个用于求平均数(也就是 avg)的 `outputFinal` 示例。除了开头的强制类型转换之外,您还将看到 `ResultValue` 对象的具体用法,即通过 `setXXX`(其中 `XXX` 是类型名)来设置最后的结果。 - -```java -public void outputFinal(State state, ResultValue resultValue) { - AvgState avgState = (AvgState) state; - - if (avgState.count != 0) { - resultValue.setDouble(avgState.sum / avgState.count); - } else { - resultValue.setNull(); - } -} -``` - - * void beforeDestroy() - -UDAF 的结束方法,您可以在此方法中进行一些资源释放等的操作。 - -此方法由框架调用。对于一个 UDF 类实例而言,生命周期中会且只会被调用一次,即在处理完最后一条记录之后被调用。 - -### 完整 Maven 项目示例 - -如果您使用 [Maven](http://search.maven.org/),可以参考我们编写的示例项目**udf-example**。您可以在 [这里](https://github.com/apache/iotdb/tree/master/example/udf) 找到它。 - -### UDF 注册 - -注册一个 UDF 可以按如下流程进行: - -1. 实现一个完整的 UDF 类,假定这个类的全类名为`org.apache.iotdb.udf.UDTFExample` -2. 将项目打成 JAR 包,如果您使用 Maven 管理项目,可以参考上述 Maven 项目示例的写法 -3. 进行注册前的准备工作,根据注册方式的不同需要做不同的准备,具体可参考以下例子 -4. 使用以下 SQL 语句注册 UDF - -```sql -CREATE FUNCTION AS (USING URI URI-STRING)? 
-``` - -#### 示例:注册名为`example`的 UDF,以下两种注册方式任选其一即可 - -##### 不指定URI - -准备工作: -使用该种方式注册时,您需要提前将 JAR 包放置到目录 `iotdb-server-1.X.X-all-bin/ext/udf`(该目录可配置) 下。 -**注意,如果您使用的是集群,那么需要将 JAR 包放置到所有 DataNode 的该目录下** - -注册语句: - -```sql -CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' -``` - -##### 指定URI - -准备工作: -使用该种方式注册时,您需要提前将 JAR 包上传到 URI 服务器上并确保执行注册语句的 IoTDB 实例能够访问该 URI 服务器。 -**注意,您无需手动放置 JAR 包,IoTDB 会下载 JAR 包并正确同步到整个集群** - -注册语句: - -```sql -CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' -``` - -#### 注意 - -由于 IoTDB 的 UDF 是通过反射技术动态装载的,因此您在装载过程中无需启停服务器。 - -UDF 函数名称是大小写不敏感的。 - -请不要给 UDF 函数注册一个内置函数的名字。使用内置函数的名字给 UDF 注册会失败。 - -不同的 JAR 包中最好不要有全类名相同但实现功能逻辑不一样的类。例如 UDF(UDAF/UDTF):`udf1`、`udf2`分别对应资源`udf1.jar`、`udf2.jar`。如果两个 JAR 包里都包含一个`org.apache.iotdb.udf.UDTFExample`类,当同一个 SQL 中同时使用到这两个 UDF 时,系统会随机加载其中一个类,导致 UDF 执行行为不一致。 - -### UDF 卸载 - -卸载 UDF 的 SQL 语法如下: - -```sql -DROP FUNCTION -``` - -可以通过如下 SQL 语句卸载上面例子中的 UDF: - -```sql -DROP FUNCTION example -``` - -### UDF 查询 - -UDF 的使用方法与普通内建函数的类似。 - -#### 支持的基础 SQL 语法 - -* `SLIMIT` / `SOFFSET` -* `LIMIT` / `OFFSET` -* 支持值过滤 -* 支持时间过滤 - - -#### 带 * 查询 - -假定现在有时间序列 `root.sg.d1.s1`和 `root.sg.d1.s2`。 - -* **执行`SELECT example(*) from root.sg.d1`** - -那么结果集中将包括`example(root.sg.d1.s1)`和`example(root.sg.d1.s2)`的结果。 - -* **执行`SELECT example(s1, *) from root.sg.d1`** - -那么结果集中将包括`example(root.sg.d1.s1, root.sg.d1.s1)`和`example(root.sg.d1.s1, root.sg.d1.s2)`的结果。 - -* **执行`SELECT example(*, *) from root.sg.d1`** - -那么结果集中将包括`example(root.sg.d1.s1, root.sg.d1.s1)`,`example(root.sg.d1.s2, root.sg.d1.s1)`,`example(root.sg.d1.s1, root.sg.d1.s2)` 和 `example(root.sg.d1.s2, root.sg.d1.s2)`的结果。 - -#### 带自定义输入参数的查询 - -您可以在进行 UDF 查询的时候,向 UDF 传入任意数量的键值对参数。键值对中的键和值都需要被单引号或者双引号引起来。注意,键值对参数只能在所有时间序列后传入。下面是一组例子: - -``` sql -SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; -SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; -``` - 
-#### 与其他查询的嵌套查询 - -``` sql -SELECT s1, s2, example(s1, s2) FROM root.sg.d1; -SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; -SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; -SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; -``` - -### 查看所有注册的 UDF - -``` sql -SHOW FUNCTIONS -``` - -### 用户权限管理 - -用户在使用 UDF 时会涉及到 1 种权限:`USE_UDF` - -* 具备该权限的用户才被允许执行 UDF 注册操作 -* 具备该权限的用户才被允许执行 UDF 卸载操作 -* 具备该权限的用户才被允许使用 UDF 进行查询 - -更多用户权限相关的内容,请参考 [权限管理语句](./Authority-Management.md##权限管理)。 - -### 配置项 - -使用配置项 `udf_lib_dir` 来配置 udf 的存储目录. -在 SQL 语句中使用自定义函数时,可能提示内存不足。这种情况下,您可以通过更改配置文件`iotdb-common.properties`中的`udf_initial_byte_array_length_for_memory_control`,`udf_memory_budget_in_mb`和`udf_reader_transformer_collector_memory_proportion`并重启服务来解决此问题。 - -### 贡献 UDF - - - -该部分主要讲述了外部用户如何将自己编写的 UDF 贡献给 IoTDB 社区。 - -#### 前提条件 - -1. UDF 具有通用性。 - - 通用性主要指的是:UDF 在某些业务场景下,可以被广泛使用。换言之,就是 UDF 具有复用价值,可被社区内其他用户直接使用。 - - 如果您不确定自己写的 UDF 是否具有通用性,可以发邮件到 `dev@iotdb.apache.org` 或直接创建 ISSUE 发起讨论。 - -2. UDF 已经完成测试,且能够正常运行在用户的生产环境中。 - -#### 贡献清单 - -1. UDF 的源代码 -2. UDF 的测试用例 -3. UDF 的使用说明 - -##### 源代码 - -1. 在`iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin`中创建 UDF 主类和相关的辅助类。 -2. 在`iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/udf/builtin/BuiltinTimeSeriesGeneratingFunction.java`中注册您编写的 UDF。 - -##### 测试用例 - -您至少需要为您贡献的 UDF 编写集成测试。 - -您可以在`integration-test/src/test/java/org/apache/iotdb/db/it/udf`中为您贡献的 UDF 新增一个测试类进行测试。 - -##### 使用说明 - -使用说明需要包含:UDF 的名称、UDF 的作用、执行函数必须的属性参数、函数的适用的场景以及使用示例等。 - -使用说明需包含中英文两个版本。应分别在 `docs/zh/UserGuide/Operation Manual/DML Data Manipulation Language.md` 和 `docs/UserGuide/Operation Manual/DML Data Manipulation Language.md` 中新增使用说明。 - -#### 提交 PR - -当您准备好源代码、测试用例和使用说明后,就可以将 UDF 贡献到 IoTDB 社区了。在 [Github](https://github.com/apache/iotdb) 上面提交 Pull Request (PR) 即可。具体提交方式见:[贡献指南](https://iotdb.apache.org/zh/Community/Development-Guide.html)。 - -当 PR 评审通过并被合并后,您的 UDF 就已经贡献给 IoTDB 社区了! 
- -### 已知实现的UDF - -#### 内置UDF - -1. [Aggregate Functions](../Reference/Function-and-Expression.md#聚合函数) 聚合函数 -2. [Arithmetic Operators and Functions](../Reference/Function-and-Expression.md#算数运算符) 算数函数 -3. [Comparison Operators and Functions](../Reference/Function-and-Expression.md#比较运算符和函数) 比较函数 -4. [String Processing](../Reference/Function-and-Expression.md#字符串处理) 字符串处理函数 -5. [Data Type Conversion Function](../Reference/Function-and-Expression.md#数据类型转换) 数据类型转换函数 -6. [Constant Timeseries Generating Functions](../Reference/Function-and-Expression.md#常序列生成函数) 常序列生成函数 -7. [Selector Functions](../Reference/Function-and-Expression.md#选择函数) 选择函数 -8. [Continuous Interval Functions](../Reference/Function-and-Expression.md#区间查询函数) 区间查询函数 -9. [Variation Trend Calculation Functions](../Reference/Function-and-Expression.md#趋势计算函数) 趋势计算函数 -10. [Sample Functions](../Reference/Function-and-Expression.md#采样函数) 采样函数 -11. [Time-Series](../Reference/Function-and-Expression.md#时间序列处理) 时间序列处理函数 - -#### 数据质量函数库 - -##### 关于 - -对基于时序数据的应用而言,数据质量至关重要。基于用户自定义函数能力,IoTDB 提供了一系列关于数据质量的函数,包括数据画像、数据质量评估与修复等,能够满足工业领域对数据质量的需求。 - -##### 快速上手 - -**该函数库中的函数不是内置函数,使用前要先加载到系统中。** 操作流程如下: - -1. 在 iotdb 根目录下执行编译指令; - ``` - mvn clean package -pl library-udf -am -DskipTests -Pget-jar-with-dependencies - ``` -2. 将在 target 下生成的带依赖的 jar 包复制到 IoTDB 程序目录的 `ext\udf` 目录下(若您使用的是集群,请将jar包复制到所有DataNode的该目录下),如下图所示; -![](https://alioss.timecho.com/docs/img/20230814-191908.jpg) -3. 下载注册脚本:[linux](https://alioss.timecho.com/docs/img/register-UDF.sh), [windows](https://alioss.timecho.com/docs/img/register-UDF.bat); -4. 将注册脚本复制到 IoTDB 的`sbin`目录下,修改脚本中的参数(默认为host=127.0.0.1,rpcPort=6667,user=root,pass=root); -5. 启动 IoTDB 服务; -6. 运行注册脚本`register-UDF.sh`以注册 UDF。 - -##### 已经实现的函数 - -1. [Data-Quality](../Reference/UDF-Libraries.md#数据质量) 数据质量 -2. [Data-Profiling](../Reference/UDF-Libraries.md#数据画像) 数据画像 -3. [Anomaly-Detection](../Reference/UDF-Libraries.md#异常检测) 异常检测 -4. 
[Frequency-Domain](../Reference/UDF-Libraries.md#频域分析) 频域分析 -5. [Data-Matching](../Reference/UDF-Libraries.md#数据匹配) 数据匹配 -6. [Data-Repairing](../Reference/UDF-Libraries.md#数据修复) 数据修复 -7. [Series-Discovery](../Reference/UDF-Libraries.md#序列发现) 序列发现 -8. [Machine-Learning](../Reference/UDF-Libraries.md#机器学习) 机器学习 - -### Q&A - -Q1: 如何修改已经注册的 UDF? - -A1: 假设 UDF 的名称为`example`,全类名为`org.apache.iotdb.udf.UDTFExample`,由`example.jar`引入 - -1. 首先卸载已经注册的`example`函数,执行`DROP FUNCTION example` -2. 删除 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下的`example.jar` -3. 修改`org.apache.iotdb.udf.UDTFExample`中的逻辑,重新打包,JAR 包的名字可以仍然为`example.jar` -4. 将新的 JAR 包上传至 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下 -5. 装载新的 UDF,执行`CREATE FUNCTION example AS "org.apache.iotdb.udf.UDTFExample"` \ No newline at end of file diff --git a/src/zh/UserGuide/latest/User-Manual/User-defined-function.md b/src/zh/UserGuide/latest/User-Manual/User-defined-function.md new file mode 100644 index 000000000..cbc2681e1 --- /dev/null +++ b/src/zh/UserGuide/latest/User-Manual/User-defined-function.md @@ -0,0 +1,209 @@ +# 用户自定义函数 + +## 1. UDF 介绍 + +UDF(User Defined Function)即用户自定义函数,IoTDB 提供多种内建的面向时序处理的函数,也支持扩展自定义函数来满足更多的计算需求。 + +IoTDB 支持两种类型的 UDF 函数,如下表所示。 + + + + + + + + + + + + + + + + + + + + + +
UDF 分类数据访问策略描述
UDTFMAPPABLE_ROW_BY_ROW自定义标量函数,输入 k 列时间序列 1 行数据,输出 1 列时间序列 1 行数据,可用于标量函数出现的任何子句和表达式中,如SELECT子句、WHERE子句等。
ROW_BY_ROW
SLIDING_TIME_WINDOW
SLIDING_SIZE_WINDOW
SESSION_TIME_WINDOW
STATE_WINDOW
自定义时间序列生成函数,输入 k 列时间序列 m 行数据,输出 1 列时间序列 n 行数据,输入行数 m 可以与输出行数 n 不相同,只能用于SELECT子句中。
UDAF-自定义聚合函数,输入 k 列时间序列 m 行数据,输出 1 列时间序列 1 行数据,可用于聚合函数出现的任何子句和表达式中,如SELECT子句、HAVING子句等。
+ +### 1.1 UDF 使用 + +UDF 的使用方法与普通内建函数类似,可以直接在 SELECT 语句中像调用普通函数一样使用UDF。 + +#### 1.支持的基础 SQL 语法 + +* `SLIMIT` / `SOFFSET` +* `LIMIT` / `OFFSET` +* 支持值过滤 +* 支持时间过滤 + + +#### 2. 带 * 查询 + +假定现在有时间序列 `root.sg.d1.s1`和 `root.sg.d1.s2`。 + +* **执行`SELECT example(*) from root.sg.d1`** + +那么结果集中将包括`example(root.sg.d1.s1)`和`example(root.sg.d1.s2)`的结果。 + +* **执行`SELECT example(s1, *) from root.sg.d1`** + +那么结果集中将包括`example(root.sg.d1.s1, root.sg.d1.s1)`和`example(root.sg.d1.s1, root.sg.d1.s2)`的结果。 + +* **执行`SELECT example(*, *) from root.sg.d1`** + +那么结果集中将包括`example(root.sg.d1.s1, root.sg.d1.s1)`,`example(root.sg.d1.s2, root.sg.d1.s1)`,`example(root.sg.d1.s1, root.sg.d1.s2)` 和 `example(root.sg.d1.s2, root.sg.d1.s2)`的结果。 + +#### 3. 带自定义输入参数的查询 + +可以在进行 UDF 查询的时候,向 UDF 传入任意数量的键值对参数。键值对中的键和值都需要被单引号或者双引号引起来。注意,键值对参数只能在所有时间序列后传入。下面是一组例子: + + 示例: +``` sql +SELECT example(s1, 'key1'='value1', 'key2'='value2'), example(*, 'key3'='value3') FROM root.sg.d1; +SELECT example(s1, s2, 'key1'='value1', 'key2'='value2') FROM root.sg.d1; +``` + +#### 4. 与其他查询的嵌套查询 + + 示例: +``` sql +SELECT s1, s2, example(s1, s2) FROM root.sg.d1; +SELECT *, example(*) FROM root.sg.d1 DISABLE ALIGN; +SELECT s1 * example(* / s1 + s2) FROM root.sg.d1; +SELECT s1, s2, s1 + example(s1, s2), s1 - example(s1 + example(s1, s2) / s2) FROM root.sg.d1; +``` + + +## 2. UDF 开发 + +可以参考 UDF函数开发:[开发指导](../Reference/UDF-development.md) + +## 3. UDF 管理 + +### 3.1 UDF 注册 + +注册一个 UDF 可以按如下流程进行: + +1. 实现一个完整的 UDF 类,假定这个类的全类名为`org.apache.iotdb.udf.UDTFExample` +2. 将项目打成 JAR 包,如果使用 Maven 管理项目,可以参考 [Maven 项目示例](https://github.com/apache/iotdb/tree/master/example/udf)的写法 +3. 进行注册前的准备工作,根据注册方式的不同需要做不同的准备,具体可参考以下例子 +4. 
使用以下 SQL 语句注册 UDF + +```sql +CREATE FUNCTION <UDF-NAME> AS <CLASS-NAME> (USING URI <URI-STRING>) +``` + +#### 示例:注册名为`example`的 UDF,以下两种注册方式任选其一即可 + +#### 方式一:手动放置jar包 + +准备工作: +使用该种方式注册时,需要提前将 JAR 包放置到集群所有 DataNode 的 `ext/udf`目录下(该目录可配置)。 + +注册语句: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' +``` + +#### 方式二:集群通过URI自动安装jar包 + +准备工作: +使用该种方式注册时,需要提前将 JAR 包上传到 URI 服务器上并确保执行注册语句的 IoTDB 实例能够访问该 URI 服务器。 + +注册语句: + +```sql +CREATE FUNCTION example AS 'org.apache.iotdb.udf.UDTFExample' USING URI 'http://jar/example.jar' +``` + +IoTDB 会下载 JAR 包并同步到整个集群。 + +#### 注意 + +1. 由于 IoTDB 的 UDF 是通过反射技术动态装载的,因此在装载过程中无需启停服务器。 + +2. UDF 函数名称是大小写不敏感的。 + +3. 请不要给 UDF 函数注册一个内置函数的名字。使用内置函数的名字给 UDF 注册会失败。 + +4. 不同的 JAR 包中最好不要有全类名相同但实现功能逻辑不一样的类。例如 UDF(UDAF/UDTF):`udf1`、`udf2`分别对应资源`udf1.jar`、`udf2.jar`。如果两个 JAR 包里都包含一个`org.apache.iotdb.udf.UDTFExample`类,当同一个 SQL 中同时使用到这两个 UDF 时,系统会随机加载其中一个类,导致 UDF 执行行为不一致。 + +### 3.2 UDF 卸载 + +SQL 语法如下: + +```sql +DROP FUNCTION <UDF-NAME> +``` + +示例:卸载上述例子的 UDF: + +```sql +DROP FUNCTION example +``` + + +### 3.3 查看所有注册的 UDF + +``` sql +SHOW FUNCTIONS +``` + +### 3.4 UDF 配置 + +- 允许在 `iotdb-common.properties` 中配置 udf 的存储目录: + ``` Properties +# UDF lib dir + +udf_lib_dir=ext/udf +``` + +- 使用自定义函数时,提示内存不足,更改 `iotdb-common.properties` 中下述配置参数并重启服务。 + ``` Properties + +# Used to estimate the memory usage of text fields in a UDF query. +# It is recommended to set this value to be slightly larger than the average length of all text +# effectiveMode: restart +# Datatype: int +udf_initial_byte_array_length_for_memory_control=48 + +# How much memory may be used in ONE UDF query (in MB). +# The upper limit is 20% of allocated memory for read. +# effectiveMode: restart +# Datatype: float +udf_memory_budget_in_mb=30.0 + +# UDF memory allocation ratio. +# The parameter form is a:b:c, where a, b, and c are integers. 
+# effectiveMode: restart +udf_reader_transformer_collector_memory_proportion=1:1:1 +``` + +### 3.5 UDF 用户权限 + +用户在使用 UDF 时会涉及到 `USE_UDF` 权限,具备该权限的用户才被允许执行 UDF 注册、卸载和查询操作。 + +更多用户权限相关的内容,请参考 [权限管理语句](./Authority-Management.md#权限管理)。 + + +## 4. UDF 函数库 + +基于用户自定义函数能力,IoTDB 提供了一系列关于时序数据处理的函数,包括数据质量、数据画像、异常检测、频域分析、数据匹配、数据修复、序列发现、机器学习等,能够满足工业领域对时序数据处理的需求。 + +可以参考 [UDF 函数库](../Reference/UDF-Libraries.md)文档,查找安装步骤及每个函数对应的注册语句,以确保正确注册所有需要的函数。 + +## 5. 常见问题: + +1. 如何修改已经注册的 UDF? + +答:假设 UDF 的名称为`example`,全类名为`org.apache.iotdb.udf.UDTFExample`,由`example.jar`引入 + +1. 首先卸载已经注册的`example`函数,执行`DROP FUNCTION example` +2. 删除 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下的`example.jar` +3. 修改`org.apache.iotdb.udf.UDTFExample`中的逻辑,重新打包,JAR 包的名字可以仍然为`example.jar` +4. 将新的 JAR 包上传至 `iotdb-server-1.0.0-all-bin/ext/udf` 目录下 +5. 装载新的 UDF,执行`CREATE FUNCTION example AS "org.apache.iotdb.udf.UDTFExample"` \ No newline at end of file