From ca478cd4560ac7d42ea826c5445234111058c24f Mon Sep 17 00:00:00 2001 From: Dmitry Date: Thu, 17 Nov 2022 18:50:34 +0100 Subject: [PATCH] Rework project structure, update arguments, update math. --- .gitignore | 3 + Dockerfile | 8 +- README.md | 105 +++- _version.py | 1 + calculate_ephemerality.py | 21 - ephemerality.py | 89 +++ requirements.txt | 2 +- setup.py | 30 + src/__init__.py | 4 +- src/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 252 bytes src/__pycache__/__init__.cpython-38.pyc | Bin 0 -> 245 bytes .../ephemerality_computation.cpython-310.pyc | Bin 0 -> 4214 bytes src/ephemerality.py | 36 -- src/ephemerality_computation.py | 136 +++++ test/test_ephemerality.py | 511 +++++++++++++++++- 15 files changed, 864 insertions(+), 82 deletions(-) create mode 100644 .gitignore create mode 100644 _version.py delete mode 100644 calculate_ephemerality.py create mode 100644 ephemerality.py create mode 100644 setup.py create mode 100644 src/__pycache__/__init__.cpython-310.pyc create mode 100644 src/__pycache__/__init__.cpython-38.pyc create mode 100644 src/__pycache__/ephemerality_computation.cpython-310.pyc delete mode 100644 src/ephemerality.py create mode 100644 src/ephemerality_computation.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e72f27 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/tmp/ +/test/.pytest_cache/ +*.json diff --git a/Dockerfile b/Dockerfile index 919aae9..51d8a5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,12 @@ -FROM python:3.8.6 +FROM python:3.9.15 ADD src /src ADD test /test -ADD calculate_ephemerality.py / +ADD ephemerality.py / ADD requirements.txt / +ADD _version.py / +ADD setup.py / RUN pip install -r requirements.txt -ENTRYPOINT ["python", "calculate_ephemerality.py"] +ENTRYPOINT ["python", "ephemerality.py"] diff --git a/README.md b/README.md index a69515f..d152221 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,50 @@ # Ephemerality metric -In [[1]](#1) we formalized the ephemerality metrics 
used to estimate the healthiness of online discussions. It shows how 'ephemeral' topics are, that is whether the discussions are more or less uniformly active or only revolve around one or several peaks of activity. +In [[1]](#1) we formalized the ephemerality metrics used to estimate the healthiness of online discussions. It shows how +'ephemeral' topics are, that is whether the discussions are more or less uniformly active or only revolve around one or +several peaks of activity. ### Requirements -The code was tested to work with Python 3.8.6 and Numpy 1.20.3, but is expected to also run on their previous versions. +The code was tested to work with Python 3.8.6 and Numpy 1.21.5, but is expected to also run on their older versions. ## How to run the experiments -The code can be run directly via the calculate_ephemerality.py script or via a Docker container built with the provided Dockerfile. +The code can be run directly via the ephemerality.py script or via a Docker container built with the provided +Dockerfile. ### Input The script/container expect the following input arguments: -* **Frequency vector file**. The file should contain a vector of numbers in csv format. It does not need to be normalized, if it is not --- it will be done automatically. -* **Output file**. Optional. If it is provided, the results will be written into this file in JSON format. +* **Frequency vector file**. `[-i PATH, --input PATH]` _Optional_. Path to a file containing one or several arrays of +numbers in csv format (one array per line), representing temporal frequency vectors. They do not need to be normalized: +if they are not --- they will be normalized automatically. +* **Frequency vector**. _Optional_. If the input file is not provided, a frequency vector is expected as a positional +argument (either comma- or space-separated). +* **Output file**. `[-o PATH, --output PATH]` _Optional_. If it is provided, the results will be written into this file +in JSON format. 
+* **Threshold**. `[-t FLOAT, --threshold FLOAT]` _Optional_. Threshold value for ephemerality computations. Defaults +to 0.8. +* **Print**. `[-p, --print]`. _Optional_. If an output file is provided, forces the results to still be printed to stdout. ### Output -The results are printed to STDOUT in **(ε2, ε4)** format. Additionally, if the output file was specified among the input arguments, the results will also be written into this file in JSON format. +If no output file is specified or the `-p` option is used, results are printed to STDOUT in **[εorig εorig_span εfiltered εfiltered_span εsorted εsorted_span]** +format, one line per line of the input file (or a single line for command line input). + +If the output file was specified among the input arguments, the results will be written into that file in JSON format as +a list of dictionaries, one per input line: + +``` +[ + { + "ephemerality_original": FLOAT, + "ephemerality_original_span": INT, + "ephemerality_filtered": FLOAT, + "ephemerality_filtered_span": INT, + "ephemerality_sorted": FLOAT, + "ephemerality_sorted_span": INT + }, + ... 
+] +``` ### Example @@ -25,33 +55,82 @@ Input file `test_input.csv`: #### Python execution: +Input 1: + ``` -python calculate_ephemerality.py ./test_input.csv ./test_output.json +python ephemerality.py -i tmp/test_input.csv -t 0.8 --output tmp/test_output.json -p ``` -Output: +Output 1: ``` -0.2, 0.25 +0.1250000000000001 7 0.5 4 0.625 3 +0.2500000000000001 3 0.5 2 0.5 2 ``` `test_output.json` content: ``` -{"ephemerality2": 0.2, "ephemerality4": 0.25} +[ + { + "ephemerality_original": 0.1250000000000001, + "ephemerality_original_span": 7, + "ephemerality_filtered": 0.5, + "ephemerality_filtered_span": 4, + "ephemerality_sorted": 0.625, + "ephemerality_sorted_span": 3 + }, + { + "ephemerality_original": 0.2500000000000001, + "ephemerality_original_span": 3, + "ephemerality_filtered": 0.5, + "ephemerality_filtered_span": 2, + "ephemerality_sorted": 0.5, + "ephemerality_sorted_span": 2 + } +] +``` + +Input 2: + +``` +python ephemerality.py 0.0 0.0 0.0 0.2 0.55 0.0 0.15 0.1 0.0 0.0 -t 0.5 +``` + +Output 2: +``` +0.0 5 0.8 1 0.8 1 ``` #### Docker execution ``` -docker run -a STDOUT -v [PATH_TO_FOLDER]/tmp/:/tmp/ ephemerality:0.1 /tmp/test_input.csv /tmp/test_output.json +docker run -a STDOUT -v [PATH_TO_FOLDER]/tmp/:/tmp/ ephemerality:1.0.0 -i /tmp/test_input.csv -o /tmp/test_output.json -t 0.5 -p ``` Output: ``` -0.2, 0.25 +0.0 5 0.8 1 0.8 1 +0.19999999999999996 2 0.6 1 0.6 1 ``` `test_output.json` content: ``` -{"ephemerality2": 0.2, "ephemerality4": 0.25} +[ + { + "ephemerality_original": 0.0, + "ephemerality_original_span": 5, + "ephemerality_filtered": 0.8, + "ephemerality_filtered_span": 1, + "ephemerality_sorted": 0.8, + "ephemerality_sorted_span": 1 + }, + { + "ephemerality_original": 0.19999999999999996, + "ephemerality_original_span": 2, + "ephemerality_filtered": 0.6, + "ephemerality_filtered_span": 1, + "ephemerality_sorted": 0.6, + "ephemerality_sorted_span": 1 + } +] ``` diff --git a/_version.py b/_version.py new file mode 100644 index 0000000..d538f87 --- 
/dev/null +++ b/_version.py @@ -0,0 +1 @@ +__version__ = "1.0.0" \ No newline at end of file diff --git a/calculate_ephemerality.py b/calculate_ephemerality.py deleted file mode 100644 index daec2bf..0000000 --- a/calculate_ephemerality.py +++ /dev/null @@ -1,21 +0,0 @@ -import sys -import json -import numpy as np -from src import compute_ephemerality_measures - - -if __name__ == '__main__': - frequency_vector_file = sys.argv[1] - output_file = sys.argv[2] if (len(sys.argv) >= 3) else None - - with open(frequency_vector_file, 'r') as f: - frequency_vector = np.array(f.read().split(','), dtype=float) - - ephemeralities = compute_ephemerality_measures(frequency_vector=frequency_vector, - threshold=0.8) - - if output_file is not None: - with open(output_file, 'w+') as f: - json.dump(ephemeralities, f) - - print(f"{ephemeralities['ephemerality2']}, {ephemeralities['ephemerality4']}") diff --git a/ephemerality.py b/ephemerality.py new file mode 100644 index 0000000..2e3d53d --- /dev/null +++ b/ephemerality.py @@ -0,0 +1,89 @@ +from _version import __version__ +import sys +import json +import argparse +import numpy as np +from src import compute_ephemeralities + + +HELP_INFO = "" + + +def init_argparse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + usage="%(prog)s [FREQUENCY_VECTOR] [-h] [-v] [-i INPUT_FILE] [-o OUTPUT_FILE.json] [-t THRESHOLD]...", + description="Calculate ephemerality for a given vector of frequencies." + ) + parser.add_argument( + "-v", "--version", action="version", + version=f"{parser.prog} version {__version__}" + ) + parser.add_argument( + "-p", "--print", action="store_true", + help="If output file is provided, forces the results to still be printed to stdout." + ) + parser.add_argument( + "-i", "--input", action="store", + help="Path to the input csv file. If not specified, will use the command line arguments " + "(delimited either by commas or spaces)." 
+ ) + parser.add_argument( + "-o", "--output", action="store", + help="Path to the output json file. If not specified, will output ephemerality values to stdout in the" + " following format separated by a space: \"EPH_ORIG EPH_ORIG_SPAN EPH_FILT EPH_FILT_SPAN EPH_SORT " + "EPH_SORT_SPAN\"" + ) + parser.add_argument( + "-t", "--threshold", action="store", default=0.8, + help="Threshold value for ephemerality computations. Defaults to 0.8." + ) + parser.add_argument( + 'frequencies', + help='frequency vector (if the input file is not specified)', + nargs='*' + ) + return parser + + +def print_ephemeralities(ephemerality_list: list[dict]): + for ephemeralities in ephemerality_list: + print(f"{ephemeralities['ephemerality_original']} {ephemeralities['ephemerality_original_span']} " + f"{ephemeralities['ephemerality_filtered']} {ephemeralities['ephemerality_filtered_span']} " + f"{ephemeralities['ephemerality_sorted']} {ephemeralities['ephemerality_sorted_span']}") + + +if __name__ == '__main__': + parser = init_argparse() + args = parser.parse_args() + + frequency_vectors = list() + + if args.input: + with open(args.input, 'r') as f: + for line in f.readlines(): + if line.strip(): + frequency_vectors.append(np.array(line.split(','), dtype=float)) + else: + if len(args.frequencies) > 1: + frequency_vectors.append(np.array(args.frequencies, dtype=float)) + elif len(args.frequencies) == 1: + if ' ' in args.frequencies[0]: + frequency_vectors.append(np.array(args.frequencies[0].split(' '), dtype=float)) + else: + frequency_vectors.append(np.array(args.frequencies[0].split(','), dtype=float)) + else: + sys.exit('No input provided!') + + threshold = float(args.threshold) + + ephemerality_list = list() + for frequency_vector in frequency_vectors: + ephemerality_list.append(compute_ephemeralities(frequency_vector=frequency_vector, threshold=threshold)) + + if args.output: + with open(args.output, 'w+') as f: + json.dump(ephemerality_list, f, indent=2) + if args.print: + 
print_ephemeralities(ephemerality_list) + else: + print_ephemeralities(ephemerality_list) diff --git a/requirements.txt b/requirements.txt index 401134e..55a90f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -numpy==1.20.3 \ No newline at end of file +numpy==1.21.5 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7a31bb1 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +import os +from setuptools import setup +import re + +VERSION_FILE = "_version.py" +VERSION_REGEX = r"^__version__ = ['\"]([^'\"]*)['\"]" + + +def read(file_name): + return open(os.path.join(os.path.dirname(__file__), file_name)).read() + + +version_lines = open(VERSION_FILE, 'r').read() +match = re.search(VERSION_REGEX, version_lines, re.M) +if match: + version = match.group(1) +else: + raise RuntimeError("Unable to find version string in %s." % (VERSION_FILE,)) + +setup( + name='ephemerality', + version=version, + packages=['src', 'test'], + url='https://github.com/HPAI-BSC/ephemerality', + license='MIT', + author='HPAI BSC', + author_email='dmitry.gnatyshak@bsc.es', + description='Module for computing ephemerality metrics of temporal arrays.', + long_description=read('README.md') +) diff --git a/src/__init__.py b/src/__init__.py index 16ab027..8eb4c79 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,3 +1,3 @@ -from src.ephemerality import compute_ephemerality_measures +from src.ephemerality_computation import compute_ephemeralities -__all__ = ['compute_ephemerality_measures'] +__all__ = ['compute_ephemeralities'] diff --git a/src/__pycache__/__init__.cpython-310.pyc b/src/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d632a5fe431b8e065629bbef115bdb5e27fe9b0 GIT binary patch literal 252 zcmYjKu?@m75VS)OD1tT>3#5q`KoJwr&{6O*@dFk%j%)|SNDRO-X{lI&iYGKV>F;&A zzk4^IXN>UNUF!qw*G~+QkXX@#oJcXn4YPd9IaA4zZxc1zup)hv#|qx%wX;nd5wJO- 
zMK6tx8iW6ptcy{%2)-@{-W~n{8MV9gSKS0a8UyfPeoU#JK9x8*i(((@HlPDp|MgZuGjsRtI@1{%ZRBN!}R2fpD8OxF{ZwEwc literal 0 HcmV?d00001 diff --git a/src/__pycache__/__init__.cpython-38.pyc b/src/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea40a6878e06c5ed5a367b8b84ae4ea9b7393f1e GIT binary patch literal 245 zcmYk0F%E)25Jh)^L}Mhevhx5m>;(+*1UAM(n#saWfVk_DU5MdG-o~2N$}3noQ1O$Q z$=q;9$a3`*ZS^loCElQ7?TSEkF#oQx{~sWrfhxrmzc`emZF21Vh^-lG#w>+Zta mqN$YZq;S|;u_OsX3xMX)bO32PjO(MZs@IrtLi*{0ZSV&yH$lPx literal 0 HcmV?d00001 diff --git a/src/__pycache__/ephemerality_computation.cpython-310.pyc b/src/__pycache__/ephemerality_computation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..768610f6f5d9a00a0fc30bff7dd1c437b6908070 GIT binary patch literal 4214 zcmds4U2o&Y73Gjaijrlm*6a0ZyXa!P**4b3wReF61&SgcX!?-`aDi@+0B*Metr=O2 zDN;M6?AV}yB2FIj1KOrQ_Ax)Bul)&p?L*+VK5u|c&mBry$rg&BMf*_N91gE$?#w;+ z+&h=a>T1ox_4+%<-M`+ltiRLC{8h!vo4C`zp^=tkk+sk8v?F`q>^rQK@qI2GI zCAnfBdOa>(>3zcYE3%^8o-KV@#hZ^&_?2WBSM7PSCRZS@U6I%2D y3gLTRceua)?V>^TA&)M77lh*I-u`}km zo%353`<7)}3|x8YWlqk{fuJPjID4AF(hW3U@}jEZ5qTb< zknf60n0BKCJfTgsT|*>eU$N31X|3X{1BMkWEI8qO+3I^GS77QKqdJKW4?|$yNryqa z0mUh;&RG)(tg|{$s5=@dV-W=+?+k^{@d9?R7V^a%SS_8gBl(!mP}u+~?%2zn-20I= zcJQ=;#C?Mf`atJ??v6RLV)nFw*%g|Zb7_5wh@Ro$65w5?;ed*3{ z-Ce-d+tak!3-Y5u_~IK}24;O7vUYV2%WF|`qI72#6-8qT?htVTd{@#e&{-#prTR$U zB$Wi4UPn`~55FDiy)c4Tl{Bae%uK1NBPgfHbH}AhN?BTJ0^JWWKBcwR z0o?*+Hz6!Ik-Dgx)X<4ke3md_31XjOll~}@>11I1{3hn5#K9)h_wo4ZPk*HA;*ABDQiiJFHhX%Ae2~*o!?cg&3KxEi zK$8yK)SM<;3I^N`9YPY0)c2%=V|jXepL>rjI1zz_qsui%;!rC}sv_$Lv3R&?lEc+gW;!}jX(|#eP&5RQ$P)2r)9}CT6*rfr z(&9Ty#fAfyrcCp|r72T!IpxNJx|02u&ak2C!%gvCA3xGRLbr=z<5-EL2l$jw@=%Fxkm68|#YsQx_7_*B z05qECA^+_xxrzsvhr(b|iA`ek$+if0)Q%V=@YiIamzBdHfIp&8(RM^2dg^3aOsEKz zQkmW!ed7)M+quXRHUUzsES6BBrv*3oQB9(z@tQGMuvww;+Hruj+NdP z!$=VlG8r92nA2C?BeM@<;$)MXXWKuYB>zREW~I%de?Izu@Xuw(94OQrF>#HEXL$6l zT(umq|ADI>JT^zow{p~5|Cyt7py#6rv<0P31t_R9gAxS;3gb~6_D~yNt)#co1W1DU 
z3sL{?#b0|zEVKN-k9d34)ZnJ3C<_VtU@1RT930@L5;ccs;hzMWD3lhQdPUVs0OsaDdPXxg=xWk`8;ze+!SpPDu`^mV6SqvipcqA>>`vdrazZez(Ik6~|PYWhAJ zpZmsb}~)&9qr%&R|oH xnNKGFCxb+eBK100ZlzQP`6yR?)^ZxvR;5*It#8zAkH7Hn!2r8o)a=HKe*&6qb{+r# literal 0 HcmV?d00001 diff --git a/src/ephemerality.py b/src/ephemerality.py deleted file mode 100644 index c267bda..0000000 --- a/src/ephemerality.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -from typing import Sequence, Dict - - -def compute_ephemerality_measures( - frequency_vector: Sequence[float], - threshold: float = 0.8) -> Dict[str, float]: - frequency_vector = np.array(frequency_vector) - if sum(frequency_vector) != 1.: - frequency_vector /= np.sum(frequency_vector) - cumulative_distribution_function = np.cumsum(frequency_vector) - - lower_threshold = (1. - threshold) / 2 - upper_threshold = 1 - lower_threshold - eph_lower = (cumulative_distribution_function >= lower_threshold).argmax() - eph_upper = (cumulative_distribution_function >= upper_threshold).argmax() - topic_range = len(frequency_vector) - 1 - first_freq_index_above_zero = (frequency_vector > 0.).argmax() - last_freq_index_above_zero = topic_range - np.flip(frequency_vector > 0.).argmax() - topic_activity_range = last_freq_index_above_zero - first_freq_index_above_zero + 1 - - ephemerality_2_range = eph_upper - eph_lower + 1 - ephemerality_2 = 1 - (ephemerality_2_range / topic_activity_range) - - freq_descending_order = np.sort(frequency_vector)[::-1] - aux_cdf = np.cumsum(freq_descending_order) - eph4_thr = (aux_cdf >= threshold).argmax() + 1 - eph4_range = (freq_descending_order > 0.).argmin() + 1 - ephemerality_4 = 1 - ((1 / 0.8) * (eph4_thr / eph4_range)) - - ephemeralities = { - "ephemerality2": round(ephemerality_2, 2), - "ephemerality4": round(ephemerality_4, 2) - } - - return ephemeralities diff --git a/src/ephemerality_computation.py b/src/ephemerality_computation.py new file mode 100644 index 0000000..458a434 --- /dev/null +++ b/src/ephemerality_computation.py @@ -0,0 
+1,136 @@ +import numpy as np +from typing import Sequence +import warnings + + +def _normalize_frequency_vector(frequency_vector: Sequence[float]) -> np.array: + frequency_vector = np.array(frequency_vector) + + if sum(frequency_vector) != 1.: + frequency_vector /= np.sum(frequency_vector) + + return frequency_vector + + +def ephemerality_raise_error(threshold: float): + if 0. < threshold <= 1: + raise ValueError('Input frequency vector has not been internally normalized!') + else: + raise ValueError('Threshold value is not within (0, 1] range!') + + +def compute_ephemerality_original_span(frequency_vector: np.array, threshold: float) -> int: + current_sum = 0 + for i, freq in enumerate(frequency_vector): + current_sum = current_sum + freq + if np.isclose(current_sum, threshold) or current_sum > threshold: + return i + 1 + + ephemerality_raise_error(threshold) + + +def compute_ephemerality_filtered_span(frequency_vector: np.array, threshold: float) -> int: + lower_threshold = (1. - threshold) / 2 + + current_presum = 0 + start_index = -1 + for i, freq in enumerate(frequency_vector): + current_presum += freq + if current_presum > lower_threshold and not np.isclose(current_presum, lower_threshold): + start_index = i + break + + current_sum = 0 + for j, freq in enumerate(frequency_vector[start_index:]): + current_sum += freq + if np.isclose(current_sum, threshold) or current_sum > threshold: + return j + 1 + + ephemerality_raise_error(threshold) + + +def compute_ephemerality_sorted_span(frequency_vector: np.array, threshold: float) -> int: + freq_descending_order = np.sort(frequency_vector)[::-1] + + current_sum = 0 + for i, freq in enumerate(freq_descending_order): + current_sum += freq + if np.isclose(current_sum, threshold) or current_sum > threshold: + return i + 1 + + ephemerality_raise_error(threshold) + + +def compute_ephemerality_from_span(span: int, range_length: int, threshold: float): + return 1 - (span / range_length) / threshold + + +def 
compute_ephemeralities( + frequency_vector: Sequence[float], + threshold: float = 0.8) -> dict[str, float]: + + if threshold <= 0.: + raise ValueError('Threshold value must be greater than 0!') + + if threshold > 1.: + raise ValueError('Threshold value must be less or equal to 1!') + + if np.isclose(np.sum(frequency_vector), 0.): + return { + 'ephemerality_original': 1., + 'ephemerality_original_span': 0, + 'ephemerality_filtered': 1., + 'ephemerality_filtered_span': 0, + 'ephemerality_sorted': 1., + 'ephemerality_sorted_span': 0 + } + + frequency_vector = _normalize_frequency_vector(frequency_vector) + range_length = len(frequency_vector) + + ephemerality_original_span = compute_ephemerality_original_span(frequency_vector, threshold) + ephemerality_original = compute_ephemerality_from_span(ephemerality_original_span, range_length, threshold) + + # print(f'Orig: {ephemerality_original}, {ephemerality_original_span}') + + if ephemerality_original < 0. and not np.isclose(ephemerality_original, 0.): + warnings.warn(f'Original ephemerality value is less than 0 ({ephemerality_original}) and is going to be rounded up! ' + f'This is indicative of the edge case in which ephemerality span is greater than ' + f'[threshold * input_vector_length], i.e. most of the frequency mass lies in a few vector ' + f'elements at the end of the frequency vector. Original ephemerality in this case should be ' + f'considered to be equal to 0. However, please double check the input vector!', + RuntimeWarning) + ephemerality_original = 0. + + ephemerality_filtered_span = compute_ephemerality_filtered_span(frequency_vector, threshold) + ephemerality_filtered = compute_ephemerality_from_span(ephemerality_filtered_span, range_length, threshold) + if ephemerality_filtered < 0. and not np.isclose(ephemerality_filtered, 0.): + warnings.warn(f'Filtered ephemerality value is less than 0 ({ephemerality_filtered}) and is going to be rounded up! 
' + f'This is indicative of the edge case in which ephemerality span is greater than ' + f'[threshold * input_vector_length], i.e. most of the frequency mass lies in a few elements ' + f'at the beginning and the end of the frequency vector. Filtered ephemerality in this case should ' + f'be considered to be equal to 0. However, please double check the input vector!', + RuntimeWarning) + ephemerality_filtered = 0. + + ephemerality_sorted_span = compute_ephemerality_sorted_span(frequency_vector, threshold) + ephemerality_sorted = compute_ephemerality_from_span(ephemerality_sorted_span, range_length, threshold) + if ephemerality_sorted < 0. and not np.isclose(ephemerality_sorted, 0.): + warnings.warn(f'Sorted ephemerality value is less than 0 ({ephemerality_sorted}) and is going to be rounded up! ' + f'This is indicative of the rare edge case of very short and mostly uniform frequency vector (so ' + f'that ephemerality span is greater than [threshold * input_vector_length]). ' + f'Sorted ephemerality in this case should be considered to be equal to 0. ' + f'However, please double check the input vector!', + RuntimeWarning) + ephemerality_sorted = 0. 
+ + ephemeralities = { + 'ephemerality_original': ephemerality_original, + 'ephemerality_original_span': ephemerality_original_span, + 'ephemerality_filtered': ephemerality_filtered, + 'ephemerality_filtered_span': ephemerality_filtered_span, + 'ephemerality_sorted': ephemerality_sorted, + 'ephemerality_sorted_span': ephemerality_sorted_span + } + + return ephemeralities diff --git a/test/test_ephemerality.py b/test/test_ephemerality.py index a0439c9..61c7dd8 100644 --- a/test/test_ephemerality.py +++ b/test/test_ephemerality.py @@ -1,11 +1,510 @@ +import warnings from unittest import TestCase -from src import compute_ephemerality_measures +from typing import Sequence +import numpy as np +from dataclasses import dataclass +import re + +from src import compute_ephemeralities + + +@dataclass +class EphemeralityTestCase: + input_vector: Sequence[float] + threshold: float + expected_output: dict + warnings: tuple[bool, bool, bool] class TestComputeEphemerality(TestCase): - def test_compute_ephemerality_measures(self): - input_frequency_vector = [0., 0., 0., .2, .55, 0., 0.15, .1, 0., 0.] - expected_ephemeralities = {'ephemerality2': 0.2, 'ephemerality4': 0.25} - actual_ephemeralities = compute_ephemerality_measures(input_frequency_vector) - self.assertEqual(expected_ephemeralities, actual_ephemeralities) + _warning_messages = [ + re.compile( + r'Original ephemerality value is less than 0 [(]-[0-9]*[.][0-9]*[)] and is going to be rounded up! ' + r'This is indicative of the edge case in which ephemerality span is greater than ' + r'\[threshold [*] input_vector_length], i[.]e[.] most of the frequency mass lies in a few vector ' + r'elements at the end of the frequency vector[.] Original ephemerality in this case should be ' + r'considered to be equal to 0[.] However, please double check the input vector!' + ), + + re.compile( + r'Filtered ephemerality value is less than 0 [(]-[0-9]*[.][0-9]*[)] and is going to be rounded up! 
' + r'This is indicative of the edge case in which ephemerality span is greater than ' + r'\[threshold [*] input_vector_length], i[.]e[.] most of the frequency mass lies in a few elements ' + r'at the beginning and the end of the frequency vector[.] Filtered ephemerality in this case should ' + r'be considered to be equal to 0[.] However, please double check the input vector!' + ), + + re.compile( + r'Sorted ephemerality value is less than 0 [(]-[0-9]*[.][0-9]*[)] and is going to be rounded up! ' + r'This is indicative of the rare edge case of very short and mostly uniform frequency vector [(]so ' + r'that ephemerality span is greater than \[threshold [*] input_vector_length][)][.] ' + r'Sorted ephemerality in this case should be considered to be equal to 0[.] ' + r'However, please double check the input vector!' + ) + ] + + _test_cases = [ + EphemeralityTestCase( + input_vector=[1.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 1 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[1.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 1 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[1., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.375, 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0.375, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.375, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[1., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0., 
'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 1 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[0., 1.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 2, + 'ephemerality_filtered': 0.375, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.375, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 1.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 2, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 1 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[.5, .5], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 2, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 2, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 2 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[.5, .5], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 1 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[0.7, .3], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 2, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 2, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 2 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[0.7, .3], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 1 + }, + 
warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[1., 0., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.6875, 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0.6875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.6875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[1., 0., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 1 / 6, 'ephemerality_original_span': 1, + 'ephemerality_filtered': 1 / 6, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 1 / 6, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 1.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 4, + 'ephemerality_filtered': 0.6875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.6875, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 1.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 4, + 'ephemerality_filtered': 1 / 6, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 1 / 6, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 1., 0., 1.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 4, + 'ephemerality_filtered': 0.0625, 'ephemerality_filtered_span': 3, + 'ephemerality_sorted': 0.375, 'ephemerality_sorted_span': 2 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 1., 0., 1.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 2, + 'ephemerality_filtered': 1 / 6, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 1 / 6, 'ephemerality_sorted_span': 1 + }, + 
warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[1., 1., 1., 1.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 4, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 4, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 4 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[1., 1., 1., 1.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 2, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 2, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 2 + }, + warnings=(True, True, True) + ), + EphemeralityTestCase( + input_vector=[1., 1., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.375, 'ephemerality_original_span': 2, + 'ephemerality_filtered': 0.375, 'ephemerality_filtered_span': 2, + 'ephemerality_sorted': 0.375, 'ephemerality_sorted_span': 2 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[1., 1., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 1 / 6, 'ephemerality_original_span': 1, + 'ephemerality_filtered': 1 / 6, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 1 / 6, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.875, 'ephemerality_original_span': 1, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 2 / 3, 'ephemerality_original_span': 1, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 
'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.375, 'ephemerality_original_span': 5, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 5, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.625, 'ephemerality_original_span': 3, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 3, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.5, 'ephemerality_original_span': 4, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 
'ephemerality_original_span': 4, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 8, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 8, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 9, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 9, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 10, + 'ephemerality_filtered': 0.875, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 0.875, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + 
input_vector=[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 10, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=[.1, .1, .1, .1, .1, .1, .1, .1, .1, .1], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 8, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 8, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 8 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[.1, .1, .1, .1, .1, .1, .1, .1, .1, .1], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 3, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 3, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 3 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., .2, .55, 0., .15, .1, 0., 0.], + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.125, 'ephemerality_original_span': 7, + 'ephemerality_filtered': 0.5, 'ephemerality_filtered_span': 4, + 'ephemerality_sorted': 0.625, 'ephemerality_sorted_span': 3 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=[0., 0., 0., .2, .55, 0., .15, .1, 0., 0.], + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 5, + 'ephemerality_filtered': 2 / 3, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2 / 3, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=np.eye(1, 10000, k=5000).flatten(), + threshold=0.8, + expected_output={ + 'ephemerality_original': 0.375, 'ephemerality_original_span': 5000, + 'ephemerality_filtered': 0.999875, 'ephemerality_filtered_span': 1, + 
'ephemerality_sorted': 0.999875, 'ephemerality_sorted_span': 1 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=np.eye(1, 10000, k=5000).flatten(), + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 5000, + 'ephemerality_filtered': 2999 / 3000, 'ephemerality_filtered_span': 1, + 'ephemerality_sorted': 2999 / 3000, 'ephemerality_sorted_span': 1 + }, + warnings=(True, False, False) + ), + EphemeralityTestCase( + input_vector=np.ones((10000,)), + threshold=0.8, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 8000, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 8000, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 8000 + }, + warnings=(False, False, False) + ), + EphemeralityTestCase( + input_vector=np.ones((10000,)), + threshold=0.3, + expected_output={ + 'ephemerality_original': 0., 'ephemerality_original_span': 3000, + 'ephemerality_filtered': 0., 'ephemerality_filtered_span': 3000, + 'ephemerality_sorted': 0., 'ephemerality_sorted_span': 3000 + }, + warnings=(False, False, False) + ) + ] + + def add_test_case(self, + input_vector: Sequence[float], + threshold: float, + expected_output: dict, + warnings: tuple[bool, bool, bool]): + self._test_cases.append(EphemeralityTestCase( + input_vector=input_vector, + threshold=threshold, + expected_output=expected_output, + warnings=warnings + )) + + def clear(self): + self._test_cases = list() + + @staticmethod + def round_ephemeralities(ephemeralities: dict, precision: int=8): + np.round_(ephemeralities['ephemerality_original'], precision) + np.round_(ephemeralities['ephemerality_filtered'], precision) + np.round_(ephemeralities['ephemerality_sorted'], precision) + + def test_compute_ephemeralities(self): + for i, test_case in enumerate(self._test_cases): + print(f'\nRunning test case {i}: {test_case.input_vector}, threshold {test_case.threshold}...') + with 
warnings.catch_warnings(record=True) as warns: + warnings.simplefilter('always', category=RuntimeWarning) + + actual_output = compute_ephemeralities(frequency_vector=test_case.input_vector, + threshold=test_case.threshold) + + self.assertEqual(self.round_ephemeralities(test_case.expected_output), + self.round_ephemeralities(actual_output)) + + warn_messages = "" + for warn in warns: + warn_messages += str(warn.message) + + actual_warnings = tuple((TestComputeEphemerality._warning_messages[i].search(warn_messages) is not None + for i in range(3))) + + self.assertEqual(test_case.warnings, actual_warnings)