enhance incar parsing of backslash and multi-line str, fix ! comment handle

DanielYang59 · DanielYang59 · commit bc0885b0b9d1 · 2025-10-16T15:02:34.000+02:00
fix most issues: multi-line str still doesn't work

almost there, one statement in comment should not be parsed

guess it's fine to strip multi-line string

I guess ;! is not a valid use case, ; is meant to connect to statements

oops, looks like comment would be parsed

fix test

first working version
diff --git a/src/pymatgen/io/vasp/inputs.py b/src/pymatgen/io/vasp/inputs.py
@@ -961,13 +961,39 @@ def from_str(cls, string: str) -> Self:
         Returns:
             Incar object
         """
+        string = "\n".join([ln.split("#", 1)[0].split("!", 1)[0].rstrip() for ln in string.splitlines()])
+
         params: dict[str, Any] = {}
-        for line in clean_lines(string.splitlines()):
-            for sline in line.split(";"):
-                if match := re.match(r"(\w+)\s*=\s*(.*)", sline.strip()):
-                    key: str = match[1].strip()
-                    val: str = match[2].strip()
-                    params[key] = cls.proc_val(key, val)
+
+        # Handle line continuations (\)
+        string = re.sub(r"\\\s*\n", " ", string)
+
+        # Regex pattern to find all valid "key = value" assignments at once
+        pattern = re.compile(
+            r"""
+            (?P<key>\w+)             # Key (e.g. ENCUT)
+            \s*=\s*                  # Equals sign and optional spaces
+            (?:                      # Non-capturing group for the value
+                "                    # Opening quote
+                (?P<qval>.*?)        # Capture everything inside (non-greedy)
+                [ \t]*"              # Allow trailing spaces/tabs before closing quote
+                |                    # OR
+                (?P<val>[^#!;\n]*)   # Unquoted value (stops before comment/separator)
+            )
+            """,
+            re.VERBOSE | re.DOTALL,
+        )
+
+        # Find all matches in the entire string
+        for match in pattern.finditer(string):
+            key = match.group("key")
+            val = match.group("qval") if match.group("qval") is not None else (match.group("val") or "").strip()
+
+            if not val:
+                continue
+
+            params[key] = cls.proc_val(key, val)
+
         return cls(params)
 
     @staticmethod
@@ -1038,7 +1064,7 @@ def proc_val(key: str, val: str) -> list | bool | float | int | str:
         )
         lower_str_keys = ("ML_MODE",)
         # String keywords to read "as is" (no case transformation, only stripped)
-        as_is_str_keys = ("SYSTEM",)
+        as_is_str_keys = ("SYSTEM", "WANNIER90_WIN")
 
         def smart_int_or_float_bool(str_: str) -> float | int | bool:
             """Determine whether a string represents an integer or a float."""
diff --git a/tests/io/vasp/test_inputs.py b/tests/io/vasp/test_inputs.py
@@ -886,104 +886,91 @@ def test_write(self):
         incar = Incar.from_file(tmp_file)
         assert incar == self.incar
 
-    def test_from_str_complex(self):
-        r"""Test of handling complex INCAR:
-        - Multiple statements on a single line separated by semicolon
-        - Comments marked by hashtag # or exclamation mark !
-        - Ignore lines does not fit (tag = values) statement format
-        - Long lines split by backslash \
-        - Multi-line strings (comment would not be ignored), e.g. WANNIER90_WIN
-
-        TODO:
-            - test line-ending char independence
-            - test cast casting for multi-line string (auto-capitalization?)
-        """
+    def test_from_str_comment_handling(self):
         incar_str = r"""
-        # Test comment handling (especially for string tags)
-        SIGMA = 0.05  # random comment (known float tag)
-        EDIFF = 1e-6  ! another comment (known float tag)
-        ALGO = Normal # comment (unknown tag -> inferred as str)
-        GGA = PE ! comment (unknown tag -> inferred as str)
-
-        # Test interaction between semicolon and comment
-        ENCUT = 520; ISMEAR = 0  # smearing scheme
-        PREC = Accurate ; LREAL = Auto  ! precision and projection scheme
-        NELM = 60; ! ENCUT = 200  # should not parse second assignment
-        ENMIN = 100; # ENCUT = 200  # should not parse second assignment
-
-        # Line continuation with backslash (backslash in comment)
-        ENMAX = 200 ! \
-        IBRION = 0 # \
-        MAGMOM  = 0 0 1.0 0 0 -1.0 \
-            0 0 1.0 0 0 -1.0 \
-            6*0
-
-        # Multi-line string with embedded comments
-        WANNIER90_WIN = "Begin Projections
-        Fe:d ; Fe:p  # comment inside string
-        End Projections  ! random comment
-        "
+        # A = 0
+        ! B=1
+        SIGMA = 0.05   # random comment (known float tag)
+        EDIFF = 1e-6   ! another comment (known float tag)
+        ALGO = Normal  # comment (unknown tag -> inferred as str)
+        GGA = PE       ! comment (unknown tag -> inferred as str)
+        """
+        incar = Incar.from_str(incar_str)
 
-        # Test valid statement (tag = values) in comment
-        ! invalid ENCUT = 100
-        # still invalid ENCUT = 200
+        assert set(incar.keys()) == {"SIGMA", "EDIFF", "ALGO", "GGA"}
+        assert incar["SIGMA"] == approx(0.05)
+        assert incar["EDIFF"] == approx(1e-6)
+        assert incar["ALGO"] == "Normal"
+        assert incar["GGA"] == "Pe"
 
-        # Test invalid statement (tag = values)
-        Not a valid statement
-        ENCUT 300
+    def test_from_str_semicolon_separated_statements(self):
+        # Test interaction between semicolon and comment
+        incar_str = r"""
+        ENMAX = 400; ALGO = Fast         ! A = 0
+        ENCUT = 500; ISMEAR = 0          # B=1
+        PREC = Accurate ; LREAL = Auto   ! precision and projection scheme
+        IBRION = 2; ISIF = 3; NSW = 100  # three statements in one line
         """
-
         incar = Incar.from_str(incar_str)
 
-        expected_keys = {
+        assert set(incar.keys()) == {
+            "ENMAX",
+            "ALGO",
             "ENCUT",
             "ISMEAR",
             "PREC",
             "LREAL",
-            "NELM",
-            "ENMIN",
-            "ENMAX",
             "IBRION",
-            "ALGO",
-            "GGA",
-            "SIGMA",
-            "EDIFF",
-            "MAGMOM",
-            "WANNIER90_WIN",
+            "ISIF",
+            "NSW",
         }
-        assert set(incar.keys()) == expected_keys
 
-        # Comment handling
-        assert incar["SIGMA"] == approx(0.05)
-        assert incar["EDIFF"] == approx(1e-6)
-        assert incar["ALGO"] == "Normal"
-        assert incar["GGA"] == "Pe"
-
-        # Line with both ; and comment
-        assert incar["ENCUT"] == 520
+        assert incar["ENMAX"] == 400
+        assert incar["ALGO"] == "Fast"
+        assert incar["ENCUT"] == 500
         assert incar["ISMEAR"] == 0
-        assert incar["NELM"] == 60
-        assert incar["ENMIN"] == 100
+        assert incar["PREC"] == "Accurate"
+        assert incar["LREAL"] == "Auto"
+        assert incar["IBRION"] == 2
+        assert incar["ISIF"] == 3
+        assert incar["NSW"] == 100
+
+    def test_from_str_line_continuation_with_backslash(self):
+        # Test line continuation with backslash
+        incar_str = r"""
+        ALGO = Normal  # \ This backslash should be ignored
+        ENMAX = 200    ! \ This backslash should be ignored
+        MAGMOM  = 0 0 1.0 0 0 -1.0 \
+                0 0 1.0 0 0 -1.0 \
+                6*0
+        """
+        incar = Incar.from_str(incar_str)
+
+        assert set(incar.keys()) == {"ALGO", "ENMAX", "MAGMOM"}
+        assert incar["ALGO"] == "Normal"
         assert incar["ENMAX"] == 200
-        assert incar["IBRION"] == 0
-        assert incar["PREC"].lower() == "accurate"
-        assert incar["LREAL"].lower() == "auto"
 
-        # Continuation merged properly
-        magmom = incar["MAGMOM"]
-        assert magmom == [0, 0, 1.0, 0, 0, -1.0, 0, 0, 1.0, 0, 0, -1.0] + [0.0] * 6
+        assert incar["MAGMOM"] == [0, 0, 1.0, 0, 0, -1.0, 0, 0, 1.0, 0, 0, -1.0] + [0.0] * 6
 
-        # Multi-line string with comment
-        win = incar["WANNIER90_WIN"]
-        expected_win = "Begin Projections\nFe:d ; Fe:p  # comment inside string\nEnd Projections  ! random comment\n"
-        # Comments and structure inside string should be preserved exactly
-        assert win.strip() == expected_win.strip()
+    def test_from_str_multiline_string(self):
+        incar_str = r"""
+        # Multi-line string with embedded comments
+        WANNIER90_WIN = "begin Projections  # should NOT be capitalized
+        Fe:d ; Fe:p  # comment inside string
+        End Projections  ! random comment
+        "  # comment after closing quote
+        """
+        incar = Incar.from_str(incar_str)
 
-    def test_from_str_not_closed_multi_line_str(self):
-        """Test not closed (no ending quote) multi-line string.
+        assert set(incar.keys()) == {"WANNIER90_WIN"}
 
-        TODO:
-        """
+        # Comments inside the string would be lost
+        assert (
+            incar["WANNIER90_WIN"]
+            == """begin Projections
+        Fe:d ; Fe:p
+        End Projections"""
+        )
 
     def test_get_str(self):
         incar_str = self.incar.get_str(pretty=True, sort_keys=True)