3 changes: 0 additions & 3 deletions dev/tox.ini
@@ -18,9 +18,6 @@
ignore =
E203, # Skip as black formatter adds a whitespace around ':'.
E402, # Module top level import is disabled for optional import check, etc.
# 1. Type hints with def are treated as redefinition (e.g., functions.log).
# 2. Some are used for testing.
F811,
# There are too many instances to fix. Ignored for now.
W503,
W504,
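
For context, F811 is pyflakes' "redefinition of unused name" check, which this change re-enables by dropping it from the ignore list. A minimal illustrative sketch of the two patterns it reports (the names below are made up for illustration, not taken from this PR):

    import unittest
    import unittest  # F811: redefinition of unused 'unittest' (duplicate import)


    def log(col):
        # Silently shadowed by the definition below; this version can never be called.
        return col


    def log(arg1, arg2=None):  # F811: redefinition of unused 'log'
        return arg1 if arg2 is None else arg2

Most of the changes in this PR remove exactly these kinds of duplicate imports and duplicate definitions so the suppression is no longer needed.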
1 change: 0 additions & 1 deletion python/pyspark/logger/tests/test_logger.py
@@ -215,7 +215,6 @@ class LoggerTests(LoggerTestsMixin, ReusedSQLTestCase):


if __name__ == "__main__":
import unittest
from pyspark.logger.tests.test_logger import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/ml/util.py
@@ -50,7 +50,6 @@
from py4j.java_gateway import JavaGateway, JavaObject
from pyspark.ml._typing import PipelineStage
from pyspark.ml.base import Params
from pyspark.ml.wrapper import JavaWrapper
from pyspark.core.context import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
1 change: 0 additions & 1 deletion python/pyspark/mllib/classification.py
@@ -34,7 +34,6 @@
)
from pyspark.mllib.util import Saveable, Loader, inherit_doc
from pyspark.mllib.linalg import Vector
from pyspark.mllib.regression import LabeledPoint

if TYPE_CHECKING:
from pyspark.mllib._typing import VectorLike
1 change: 0 additions & 1 deletion python/pyspark/mllib/feature.py
@@ -36,7 +36,6 @@

if TYPE_CHECKING:
from pyspark.mllib._typing import VectorLike
from py4j.java_collections import JavaMap

__all__ = [
"Normalizer",
2 changes: 1 addition & 1 deletion python/pyspark/mllib/regression.py
@@ -32,7 +32,7 @@

import numpy as np

from pyspark import RDD, since
from pyspark import since
from pyspark.streaming.dstream import DStream
from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
from pyspark.mllib.linalg import _convert_to_vector
2 changes: 1 addition & 1 deletion python/pyspark/mllib/tree.py
@@ -18,7 +18,7 @@
import sys
import random

from pyspark import RDD, since
from pyspark import since
from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
3 changes: 1 addition & 2 deletions python/pyspark/mllib/util.py
@@ -20,15 +20,14 @@

import numpy as np

from pyspark import SparkContext, since
from pyspark import since
from pyspark.mllib.common import callMLlibFunc, inherit_doc
from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
from pyspark.sql import DataFrame
from typing import Generic, Iterable, List, Optional, Tuple, Type, TypeVar, cast, TYPE_CHECKING
from pyspark.core.context import SparkContext
from pyspark.mllib.linalg import Vector
from pyspark.core.rdd import RDD
from pyspark.sql.dataframe import DataFrame

T = TypeVar("T")
L = TypeVar("L", bound="Loader")
@@ -124,7 +124,6 @@ class DiffFramesCorrWithTests(


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.diff_frames_ops.test_corrwith import * # noqa: F401

try:
@@ -91,7 +91,6 @@ class DiffFramesDotFrameTests(DiffFramesDotFrameMixin, PandasOnSparkTestCase, SQ


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.diff_frames_ops.test_dot_frame import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/pandas/tests/indexes/test_category.py
@@ -402,7 +402,6 @@ class CategoricalIndexTests(CategoricalIndexTestsMixin, PandasOnSparkTestCase, T


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.indexes.test_category import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/pandas/tests/indexes/test_timedelta.py
@@ -115,7 +115,6 @@ class TimedeltaIndexTests(


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.indexes.test_timedelta import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -226,7 +226,6 @@ class SeriesStringOpsAdvTests(


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.series.test_string_ops_adv import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/pandas/tests/test_namespace.py
@@ -683,7 +683,6 @@ class NamespaceTests(NamespaceTestsMixin, PandasOnSparkTestCase, SQLTestUtils):


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.test_namespace import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/pandas/tests/test_numpy_compat.py
@@ -198,7 +198,6 @@ class NumPyCompatTests(


if __name__ == "__main__":
import unittest
from pyspark.pandas.tests.test_numpy_compat import * # noqa: F401

try:
2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/test_utils.py
@@ -181,7 +181,7 @@ def test_dataframe_error_assert_pandas_almost_equal(self):
},
)

def test_series_error_assert_pandas_equal(self):
def test_series_error_assert_pandas_almost_equal_2(self):
series1 = pd.Series([1, 2, 3])
series2 = pd.Series([4, 5, 6])
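
The rename above is not cosmetic: when a test class defines two methods with the same name, the second definition silently replaces the first in the class body, so one of the tests never runs; F811 is the check that surfaces this. A small sketch with hypothetical test names (not the actual pyspark tests):

    import unittest


    class ExampleTests(unittest.TestCase):
        def test_values_equal(self):
            self.assertEqual(1 + 1, 2)  # shadowed by the method below; never collected

        def test_values_equal(self):  # F811: redefinition of unused 'test_values_equal'
            self.assertEqual(2 + 2, 4)


    if __name__ == "__main__":
        unittest.main()  # only the second test_values_equal is discovered and run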

@@ -17,7 +17,6 @@
import unittest

from pyspark.errors import PySparkException
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.connectutils import (
ReusedConnectTestCase,
should_test_connect,
1 change: 0 additions & 1 deletion python/pyspark/sql/connect/dataframe.py
@@ -19,7 +19,6 @@
from pyspark.errors.exceptions.base import (
SessionNotSameException,
PySparkIndexError,
PySparkAttributeError,
)
from pyspark.resource import ResourceProfile
from pyspark.sql.connect.logging import logger
1 change: 0 additions & 1 deletion python/pyspark/sql/connect/group.py
@@ -55,7 +55,6 @@
PandasGroupedMapFunctionWithState,
)
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.types import StructType


class GroupedData:
1 change: 0 additions & 1 deletion python/pyspark/sql/connect/udf.py
@@ -50,7 +50,6 @@
UserDefinedFunctionLike,
)
from pyspark.sql.connect.session import SparkSession
from pyspark.sql.types import StringType


def _create_py_udf(
52 changes: 1 addition & 51 deletions python/pyspark/sql/functions/builtin.py
@@ -3061,56 +3061,6 @@ def floor(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Co
return _invoke_function_over_columns("floor", col, scale) # type: ignore[arg-type]


@_try_remote_functions
def log(col: "ColumnOrName") -> Column:
"""
Computes the natural logarithm of the given value.

.. versionadded:: 1.4.0

.. versionchanged:: 3.4.0
Supports Spark Connect.

Parameters
----------
col : :class:`~pyspark.sql.Column` or column name
column to calculate natural logarithm for.

Returns
-------
:class:`~pyspark.sql.Column`
natural logarithm of the given value.

Examples
zhengruifeng (Contributor) commented on Nov 28, 2025:

let's move the examples to the remaining log, to keep the doctest coverage

The author (Contributor) replied:

The remaining log has its examples and I think the coverage is similar (if not more):

    Examples
    --------
    Example 1: Specify both base number and the input value

    >>> from pyspark.sql import functions as sf
    >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (4) AS t(value)")
    >>> df.select("*", sf.log(2.0, df.value)).show()
    +-----+---------------+
    |value|LOG(2.0, value)|
    +-----+---------------+
    |    1|            0.0|
    |    2|            1.0|
    |    4|            2.0|
    +-----+---------------+

    Example 2: Return NULL for invalid input values

    >>> from pyspark.sql import functions as sf
    >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (0), (-1), (NULL) AS t(value)")
    >>> df.select("*", sf.log(3.0, df.value)).show()
    +-----+------------------+
    |value|   LOG(3.0, value)|
    +-----+------------------+
    |    1|               0.0|
    |    2|0.6309297535714...|
    |    0|              NULL|
    |   -1|              NULL|
    | NULL|              NULL|
    +-----+------------------+

    Example 3: Specify only the input value (Natural logarithm)

    >>> from pyspark.sql import functions as sf
    >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (4) AS t(value)")
    >>> df.select("*", sf.log(df.value)).show()
    +-----+------------------+
    |value|         ln(value)|
    +-----+------------------+
    |    1|               0.0|
    |    2|0.6931471805599...|
    |    4|1.3862943611198...|
    +-----+------------------+

--------
Example 1: Compute the natural logarithm of E

>>> from pyspark.sql import functions as sf
>>> spark.range(1).select(sf.log(sf.e())).show()
+-------+
|ln(E())|
+-------+
|    1.0|
+-------+

Example 2: Compute the natural logarithm of invalid values

>>> from pyspark.sql import functions as sf
>>> spark.sql(
... "SELECT * FROM VALUES (-1), (0), (FLOAT('NAN')), (NULL) AS TAB(value)"
... ).select("*", sf.log("value")).show()
+-----+---------+
|value|ln(value)|
+-----+---------+
| -1.0|     NULL|
|  0.0|     NULL|
|  NaN|      NaN|
| NULL|     NULL|
+-----+---------+
"""
return _invoke_function_over_columns("log", col)


@_try_remote_functions
def log10(col: "ColumnOrName") -> Column:
"""
@@ -8342,7 +8292,7 @@ def when(condition: Column, value: Any) -> Column:
return _invoke_function("when", condition._jc, v)


@overload # type: ignore[no-redef]
@overload
def log(arg1: "ColumnOrName") -> Column:
...
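
With the duplicate single-argument `log` removed above, these overload stubs no longer redefine an earlier function, so the `# type: ignore[no-redef]` suppression can go. For readers unfamiliar with the pattern, a small generic sketch of `typing.overload` stubs backed by a single implementation (plain Python for illustration, not the actual pyspark signatures):

    import math
    from typing import Optional, overload


    @overload
    def log(value: float) -> float: ...
    @overload
    def log(base: float, value: float) -> float: ...


    def log(arg1: float, arg2: Optional[float] = None) -> float:
        # One runtime implementation; the stubs above exist only for type checkers.
        if arg2 is None:
            return math.log(arg1)        # one argument: natural logarithm
        return math.log(arg2, arg1)      # two arguments: the first is the base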

6 changes: 0 additions & 6 deletions python/pyspark/sql/tests/arrow/test_arrow.py
@@ -61,7 +61,6 @@
from pyspark.errors import ArithmeticException, PySparkTypeError, UnsupportedOperationException
from pyspark.loose_version import LooseVersion
from pyspark.util import is_remote_only
from pyspark.loose_version import LooseVersion

if have_pandas:
import pandas as pd
@@ -1009,11 +1008,6 @@ def check_createDataFrame_pandas_with_struct_type(self, arrow_enabled):
expected[r][e] == result[r][e], f"{expected[r][e]} == {result[r][e]}"
)

def test_createDataFrame_pandas_with_struct_type(self):
A reviewer (Contributor) commented:

why remove this one?

The author (Contributor) replied:

There is another one at line 986 that is exactly the same.

for arrow_enabled in [True, False]:
with self.subTest(arrow_enabled=arrow_enabled):
self.check_createDataFrame_pandas_with_struct_type(arrow_enabled)

def test_createDataFrame_arrow_with_struct_type_nulls(self):
t = pa.table(
{
14 changes: 7 additions & 7 deletions python/pyspark/sql/tests/arrow/test_arrow_udf.py
@@ -164,13 +164,13 @@ def test_arrow_udf_wrong_arg(self):
with self.assertRaises(ParseException):

@arrow_udf("blah")
def foo(x):
def _(x):
return x

with self.assertRaises(PySparkTypeError) as pe:

@arrow_udf(returnType="double", functionType=PandasUDFType.SCALAR)
def foo(df):
def _(df):
return df

self.check_error(
@@ -185,7 +185,7 @@ def foo(df):
with self.assertRaises(PySparkTypeError) as pe:

@arrow_udf(functionType=ArrowUDFType.SCALAR)
def foo(x):
def _(x):
return x

self.check_error(
@@ -197,7 +197,7 @@ def foo(x):
with self.assertRaises(PySparkTypeError) as pe:

@arrow_udf("double", 100)
def foo(x):
def _(x):
return x

self.check_error(
@@ -209,7 +209,7 @@ def foo(x):
with self.assertRaises(PySparkTypeError) as pe:

@arrow_udf(returnType=PandasUDFType.GROUPED_MAP)
def foo(df):
def _(df):
return df

self.check_error(
@@ -224,13 +224,13 @@ def foo(df):
with self.assertRaisesRegex(ValueError, "0-arg arrow_udfs.*not.*supported"):

@arrow_udf(LongType(), ArrowUDFType.SCALAR)
def zero_with_type():
def _():
return 1

with self.assertRaisesRegex(ValueError, "0-arg arrow_udfs.*not.*supported"):

@arrow_udf(LongType(), ArrowUDFType.SCALAR_ITER)
def zero_with_type():
def _():
yield 1
yield 2

2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
@@ -27,7 +27,7 @@
from pyspark.util import PythonEvalType

from pyspark.sql.functions import arrow_udf, ArrowUDFType
from pyspark.sql import Row, functions as F
from pyspark.sql import functions as F
from pyspark.sql.types import (
IntegerType,
ByteType,
@@ -313,7 +313,6 @@ def test_server_listener_uninterruptible(self):


if __name__ == "__main__":
import unittest
from pyspark.sql.tests.connect.streaming.test_parity_listener import * # noqa: F401

try:
@@ -26,7 +26,6 @@ class DataFrameQueryContextParityTests(DataFrameQueryContextTestsMixin, ReusedCo


if __name__ == "__main__":
import unittest
from pyspark.sql.tests.connect.test_parity_dataframe_query_context import * # noqa: F401

try:
@@ -26,7 +26,6 @@ class GeographyTypeParityTest(GeographyTypeTestMixin, ReusedConnectTestCase):


if __name__ == "__main__":
import unittest
from pyspark.sql.tests.connect.test_parity_geographytype import * # noqa: F401

try:
@@ -26,7 +26,6 @@ class GeometryTypeParityTest(GeometryTypeTestMixin, ReusedConnectTestCase):


if __name__ == "__main__":
import unittest
from pyspark.sql.tests.connect.test_parity_geometrytype import * # noqa: F401

try:
@@ -29,7 +29,6 @@ class DataFrameObservationParityTests(


if __name__ == "__main__":
import unittest
from pyspark.sql.tests.connect.test_parity_observation import * # noqa: F401

try:
1 change: 0 additions & 1 deletion python/pyspark/sql/tests/connect/test_parity_readwriter.py
@@ -37,7 +37,6 @@ def test_partitioning_functions(self):


if __name__ == "__main__":
import unittest
from pyspark.sql.tests.connect.test_parity_readwriter import * # noqa: F401

try:
4 changes: 0 additions & 4 deletions python/pyspark/sql/tests/connect/test_parity_udf.py
@@ -56,10 +56,6 @@ def test_udf_defers_judf_initialization(self):
def test_nondeterministic_udf3(self):
super().test_nondeterministic_udf3()

@unittest.skip("Spark Connect doesn't support RDD but the test depends on it.")
def test_worker_original_stdin_closed(self):
super().test_worker_original_stdin_closed()

@unittest.skip("Spark Connect does not support SQLContext but the test depends on it.")
def test_udf_on_sql_context(self):
super().test_udf_on_sql_context()
@@ -26,7 +26,6 @@
ReusedSQLTestCase,
)

from pyspark.testing.sqlutils import ReusedSQLTestCase
from pyspark.sql.tests.pandas.streaming.test_pandas_transform_with_state_state_variable import (
TransformWithStateStateVariableTestsMixin,
)