From dc44ff7bcb20369ea7819cc1ed5d96604d63be17 Mon Sep 17 00:00:00 2001 From: Andy Date: Sun, 2 Feb 2025 13:35:46 +0000 Subject: [PATCH] Major refactor This is a major refactor of the code. We finally have some structure to it. Better WEBVTT handling, and previous PR improvements have been integrated or improved. --- .flake8 | 2 + README.md | 18 +- pyproject.toml | 24 +++ requirements.txt | Bin 0 -> 262 bytes subtotxt.py | 445 ++++++++++++++++++++++++++++++----------------- 5 files changed, 320 insertions(+), 169 deletions(-) create mode 100644 .flake8 create mode 100644 pyproject.toml create mode 100644 requirements.txt diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..79a16af --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 \ No newline at end of file diff --git a/README.md b/README.md index b8ba475..9e268a7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # subtotxt -Quickly convert a [SubRip](https://en.wikipedia.org/wiki/SubRip) .srt or [WEBVTT](https://en.wikipedia.org/wiki/WebVTT) .vtt subtitle file to plain text. Removes timestamps and .srt subtitle line numbers. +Quickly convert a [SubRip](https://en.wikipedia.org/wiki/SubRip) .srt or [WEBVTT](https://en.wikipedia.org/wiki/WebVTT) .vtt subtitle file to plain text. Removes timestamps and .srt/.vtt subtitle line numbers. This was a quick project thrown together for my girlfriend, she's still learning English and wanted to be able to read subtitles more like a transcript for some trickier language issues (and to understand the jokes in Friends by discussing them with me). With a spot of feature creep and some encoding detection needs, it evolved into being able to detect character encoding, along with being able to understand both .srt and .vtt formats to save some pre-processing work. 
@@ -10,7 +10,7 @@ or ```python C:\Python\subtotxt.py -f subtitle.vtt``` The script will check which format the subtitle file is (incase of incorrect file extensions), detect the character encoding used then write out a .txt file with the same name as your input. If the output file already exists it will ask for permission to delete and create a new one. ## Advanced Usage: -The script has six more arguments you can parse: +The script has more advanced arguments you can pass: - *--utf8* or *-8* Forces the output file to use [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoding. This may eliminate character encoding issues if you cannot view the output file. In practice, if you can read the contents of the input subtitle file successfully the output should work without the need to change the encoding. - *--pause* or *-p* @@ -20,26 +20,27 @@ Prints the output to the console while writing to the file, may help with debugg - *--copy* or *-c* Copies input to output without change, appends *-copy* to filename *e.g.: subtitle-copy.srt*, handy to use with *--utf8* to quickly change encoding. Might be useful if your video player app cannot understand your original subtitle file encoding. - *--overwrite* or *-o* -Skips asking ```Output file already exists, delete and make a new one? [y/n]``` and simply deletes the existing output file to create a new one. Ideal for batch processing. +Skips asking `Output file already exists, delete and make a new one? [y/n]` and simply deletes the existing output file to create a new one. Ideal for batch processing. - *--oneliners* or *-1* Writes all sentences in one line, even if the original file divides some sentences into many lines or subtitles. - *--help* or *-h* Shows above information. ## Required External Modules: - [Send2Trash](https://pypi.org/project/Send2Trash/) Python module to safely delete the old output file on both Win and \*nix based systems. 
-- ~~[cchardet](https://pypi.org/project/cchardet/) Python module to detect your subtitle file encoding~~ (Removed for v2.0 release due to issues with Python 3.10.x installs, still used in v1.0 and will work on Python 3.9.x installs). -- [charset_normalizer](https://github.com/Ousret/charset_normalizer) Python module to detect your subtitle file encoding (v2.0+ supports Python 3.9.x and 3.10.x). +- ~~[cchardet](https://pypi.org/project/cchardet/) Python module to detect your subtitle file encoding~~ (Removed for v2.0+ release due to issues with Python 3.10.x installs, still used in v1.0 and will work on Python 3.9.x installs). +- [charset_normalizer](https://github.com/Ousret/charset_normalizer) Python module to detect your subtitle file encoding (v2.0 and YYYY-MM-DD versions, supports Python 3.9.x and above). -If your system does not these installed, it will auto install them on first use. +If your system does not have these installed, it will auto install them on first use (or if you install a new version of Python later). If you prefer you can install them either manually, or by using the `requirements.txt`. ## Features: - Fast (aside from initial missing modules install on slow net connections) -- Input files character encoding formats are autodetected (if supported by [cchardet](https://pypi.org/project/cchardet/) [v1.0] or [charset_normalizer](https://github.com/Ousret/charset_normalizer) [v2.0+]) +- Input files character encoding formats are autodetected (if supported by [cchardet](https://pypi.org/project/cchardet/) [v1.0] or [charset_normalizer](https://github.com/Ousret/charset_normalizer) [v2.0+]). For most languages it should be fine, for Chinese and near neighbour languages it can be tricky, a subtitle may contain valid characters for Mandarin or Cantonese (or other dialects) and be in potentially the wrong encoding. This can result in some wonky detection but it should not affect the overall output. 
- Output files are wrote in the same encoding as the input or can be forced to UTF8 - Should be cross platform friendly thanks to PathLib and Send2Trash - Handles UNC style ```\\myserver\myshare\mysub.srt``` paths thanks to PathLib - Handles SRT to TXT or WEBVTT to TXT - Handles multi line subtitles and subtitle lines with just numbers (does not confuse them with SRT line numbers) -- WEBVTT: Removes 'WEBVTT', 'Kind: xxxx', 'Language: xxx' headers and Timestamps from output +- Strips formatting tags, and rogue `{\an8}` tags you sometimes find in poorly converted subtitles +- WEBVTT: Removes 'WEBVTT', headers, metadata, notes, styles and timestamps from output - SRT: Removes subtitle line #'s and Timestamps, will not work if first subtitle is not 1 or if duplicated line numbers are present (rare cases but possible), use [SubtitleEdit](https://github.com/SubtitleEdit/subtitleedit) to renumber lines for now if this happens. ## Examples: WEBVTT Input: @@ -154,6 +155,5 @@ Output: - Possibly handle more formats (.ssa Sub Station Alpha would be the other major one I could think of), for now you can use something like [SubtitleEdit](https://github.com/SubtitleEdit/subtitleedit) to convert most other formats to .srt or .vtt. If you have a format you would like to convert to txt, contact me or raise an issue to see if I can add support. - GUI option for simple drag and drop usage. - Figure out a checking method for misnumbered or duplicate numbered SRT line numbers. -- Handle stripping out SRT formatting tags for bold, italic etc... ## License: Released as CC0, use it how you wish. If you do use it elsewhere, please be awesome and tag me as the original author. 
🙂 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..23650bc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[tool.black] +line-length = 120 +target-version = [ + 'py38', + 'py39', + 'py310', + 'py311', + 'py312', + 'py313', +] +exclude = ''' +/( + \.eggs + | \.git + | \.idea + | \.pytest_cache + | \.github + | _build + | build + | dist + | venv + | test/resources +)/ +''' \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b18f0064bbcb6dac32cf445572b7eb41a1934669 GIT binary patch literal 262 zcmZvWI}XAy5Jaa&;tr8gL&0UJjR_Fp#Fjr2;qZaCk?1J)H#@UCpS4$~vmP21G|(3{ zT~rl`7jab50`BEBQ%79o6 w!~P^?v^OF%nf8bM|2*N)ZkdBwSDnygZWVkP?x_yM_EiTsk;%RV{Jgh6Umb!mJpcdz literal 0 HcmV?d00001 diff --git a/subtotxt.py b/subtotxt.py index ff88be9..7c2ec53 100644 --- a/subtotxt.py +++ b/subtotxt.py @@ -1,186 +1,311 @@ +# cSpell:disable # SRT or WEBVTT to plain Text -# Author: NebularNerd Version 2.1 (July 2024) +# Author: NebularNerd +# Version: 2025-01-31 # https://github.com/NebularNerd/subtotxt -# Import required packages import sys import os import argparse -import pkg_resources import subprocess import re from pathlib import Path + +def missing_modules_installer(required_modules): + import platform + + if float(platform.python_version().rsplit(".", 1)[0].strip()) < 3.12: # pkg_resources method + import pkg_resources + + installed = {pkg.key for pkg in pkg_resources.working_set} + if float(platform.python_version().rsplit(".", 1)[0].strip()) >= 3.12: # importlib.metadata method + import importlib.metadata + + distributions = importlib.metadata.distributions() + installed = set() + for dist in distributions: + installed.add(dist.metadata["Name"].lower()) + missing = required_modules - installed + if missing: + y = "" + for x in missing: + y += f"{x.lower()}, " + print(f"Installing missing modules\n{y[:-2]}\nplease wait a few moments.") + python = sys.executable + 
subprocess.check_call([python, "-m", "pip", "install", *missing], stdout=subprocess.DEVNULL) + print("Done, thanks for waiting") + + # Install send2trash and charset_normalizer if missing. -# See https://pypi.org/project/Send2Trash/ -# See https://github.com/Ousret/charset_normalizer -REQUIRED = { - 'send2trash','charset-normalizer' -} - -installed = {pkg.key for pkg in pkg_resources.working_set} -missing = REQUIRED - installed - -if missing: - print('Installing missing modules, please wait a few moments. This only happens once.') - python = sys.executable - subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL) - print('Done, thanks for waiting') - -from send2trash import send2trash -from charset_normalizer import from_path - -# Clear screen win/*nix friendly -def cls(): - os.system('cls' if os.name=='nt' else 'clear') -cls() - -# Setup argparse -parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,description='Quickly strip SRT or WEBVTT of subtitle numbers and timestamp, then save to plain text file \nVisit https://github.com/NebularNerd/subtotxt for more information') -parser.add_argument("--file", "-f", type=str, required=True, help='Path to .srt or .vtt file, enclose in quotes if path has spaces') -parser.add_argument("--utf8", "-8", default=False, action="store_true", required=False, help='Force output file to use UTF-8 instead of input encoding') -parser.add_argument("--pause", "-p", default=False, action="store_true", required=False, help='Pauses at sanity check info to allow viewing before continuing') -parser.add_argument("--screen", "-s", default=False, action="store_true", required=False, help='Prints the conversion to the console as well as the file') -parser.add_argument("--copy", "-c", default=False, action="store_true", required=False, help='Copies input to output without change, appends -copy to filename') -parser.add_argument("--overwrite", "-o", default=False, 
action="store_true", required=False, help='Skips asking for permission to overwrite, will auto-delete old file and create a new one') -parser.add_argument("--oneliners", "-1", default=False, action="store_true", required=False, help='Write all sentences in one line, even if the original divides it into many lines or subtitles.') -args = parser.parse_args() - -# Setup file wrangling stuff and sanity checks -ifile = Path(args.file) -ofile = ifile.with_suffix('.txt') -cfile = ifile.with_stem(f"{ifile.stem}-copy") -result = from_path(ifile).best() # charset_normalizer guess encoding -encoding = result.encoding -if result is not None and encoding == "utf_8" and result.bom: - encoding += "_sig" # adds sig for utf_8_sig/bom files -if result is not None and encoding == "utf_16" and result.bom: - encoding += "_sig" # adds sig for utf_16_sig/bom files -confidence = 1.0 - result.chaos # gives probability of match being correct - -#Do stuff -print('SUB to TXT 2.0\n') -print('Input file : \n',ifile) -if args.copy: - print('Output file : \n',cfile,'\n') - deleteme = cfile -else: - print('Output file : \n',ofile,'\n') - deleteme = ofile - print('Detected Character Encoding:',encoding) - print('Confidence of encoding : {:0.2f}%'.format(confidence*100)) -if args.utf8: - print('Output encoding forced to UTF-8') - encset="utf8" -else: - print('Output will use input encoding') - encset=encoding -print('\n\n') -answer = None -if args.pause: - while answer not in ("y","n"): - answer = input("Ready to start? [y/n]") - if answer == "y": - print('Starting...') - elif answer == "n": - print ("OK, bye for now...\n\n") - sys.exit() - else: - print("Please enter y or n.") - -# Check for old file -answer = None -if not args.overwrite: - if deleteme.is_file(): - while answer not in ("y","n"): - answer = input("Output file already exists, delete and make a new one? 
[y/n]") - if answer == "y": - send2trash(deleteme) - elif answer == "n": - print ("OK, bye for now...\n\n") - sys.exit() - else: - print("Please enter y or n.") - -# Test File Format (in case of extension error) and set flags -webvtt = 0 -srt = 0 -if not args.copy: - with open(ifile, 'r', encoding=encoding) as testsub: - for line in testsub: - if "WEBVTT" in line: - webvtt = 1 - elif line.strip('\n') == "1" and re.search("(.*:.*:.*-->.*:.*:.*)",next(testsub)): - srt = 1 - -# SRT format -if srt == 1: - with open(ifile, 'r', encoding=encoding) as original, open(ofile, 'w', encoding=encset) as new: - subnum = 1 - subnumstr = str(subnum) - for line in original: - if line.strip('\n') == subnumstr and re.search("(.*:.*:.*-->.*:.*:.*)",next(original)): - subnum = subnum+1 - subnumstr = str(subnum) - #Ignore SRT Subtitle # and Timecode lines - elif not line.strip('\n') == '': - if args.screen: print(line, end='') - if args.oneliners: - line = line.strip() - if line[-1] in [".", "?", "!", "…"]: - new.write(line + '\n') - else: - new.write(line + ' ') - else: - new.write(line) - -# WEBVTT format -if webvtt == 1: - with open(ifile, 'r', encoding=encoding) as original, open(ofile, 'w', encoding=encset) as new: - subnum = 1 - subnumstr = str(subnum) - prevline = '' - for line in original: - if "WEBVTT" in line or re.search("^Kind:.*$",line) or re.search("^Language:.*$",line) or re.search("(.*:.*:.*-->.*:.*:.*)",line): - line = '' - # skip empty lines - if not line.strip(' \n') == '': - # skip duplicate lines - if prevline == line: - continue - # remove embedded tags from line - line = re.sub('<.*?>', '', line) - prevline = line - if args.screen: print(line, end='') - if args.oneliners: - line = line.strip() - if line[-1] in [".", "?", "!", "…"]: - new.write(line + '\n') - else: - new.write(line + ' ') - else: - new.write(line) - -# Copy mode -if args.copy: - with open(ifile, 'r', encoding=encoding) as original, open(cfile, 'w', encoding=encset) as new: - for line in original: - 
# https://pypi.org/project/Send2Trash/
# https://github.com/Ousret/charset_normalizer


# Import the third-party modules, installing them if the first attempt fails.
# The import is retried exactly once after the installer has run: if it still
# fails, the ModuleNotFoundError propagates instead of re-running pip forever.
try:
    from send2trash import send2trash
    from charset_normalizer import from_path
except ModuleNotFoundError:
    import importlib

    missing_modules_installer({"send2trash", "charset-normalizer"})
    importlib.invalidate_caches()  # let the import system see the new packages
    from send2trash import send2trash
    from charset_normalizer import from_path


# 8888888b.  8888888888 8888888888 .d8888b.
# 888  "Y88b 888        888       d88P  Y88b
# 888    888 888        888       Y88b.
# 888    888 8888888    8888888    "Y888b.
# 888    888 888        888           "Y88b.
# 888    888 888        888             "888
# 888  .d88P 888        888       Y88b  d88P
# 8888888P"  8888888888 888        "Y8888P"


class file_handler:
    """Hold the input path and the two output paths derived from it.

    Attributes:
        i: The input subtitle file.
        o: The text output path (the input path with a ``.txt`` suffix).
        c: The copy-mode output path (``-copy`` appended to the input stem).

    Raises:
        FileNotFoundError: If ``i`` is not an existing file.
    """

    def __init__(self, i):
        if not i.is_file():
            raise FileNotFoundError(f"File {i} not found.")
        self.i = i
        self.o = i.with_suffix(".txt")
        # with_name() rather than with_stem(): with_stem() only exists on Python 3.9+.
        self.c = i.with_name(f"{i.stem}-copy{i.suffix}")
        print(f"Input file: {i}")


class encoding:
    """Detect the character encoding of the input file and pick the output one.

    Reads the module-level ``args`` for the ``--utf8`` switch.

    Attributes:
        res: The best match returned by charset_normalizer.
        enc: Encoding used to read the input (``utf_8_sig`` when a BOM is present).
        out: Encoding used to write the output.

    Raises:
        ValueError: If charset_normalizer finds no plausible encoding.
    """

    def __init__(self, i):
        self.res = from_path(i).best()  # charset_normalizer guess encoding
        if self.res is None:
            # best() returns None when nothing matched; fail with a readable message
            # instead of an AttributeError on the next line.
            raise ValueError(f"Unable to detect the character encoding of {i}.")
        self.enc = self.res.encoding
        # NOTE: the output encoding is chosen before the "_sig" suffix is added below,
        # so it never carries the BOM variant.
        self.out = "utf_8" if args.utf8 else self.enc
        if self.enc == "utf_8" and self.res.bom:
            self.enc += "_sig"  # adds sig for utf_8_sig/bom files
        print(f"Detected Character Encoding: {self.enc}")
        print(f"Confidence of encoding: {int((1.0 - self.res.chaos) * 100)}%")
        print("Output encoding forced to UTF-8" if args.utf8 else "Output will use input encoding")


class subtitle:
    """Working state for one conversion.

    Reads the module-level ``file`` and ``enc`` objects to sniff the format.

    Attributes:
        format: ``"vtt"``, ``"srt"`` or ``None`` when the format was not recognised.
        text: The output text collected so far.
        text_finished: The output text after the final junk check.
        prev: The previously accepted line, used to drop duplicates.
        junk: Regular expressions removed from every line.
    """

    def __init__(self):
        self.format = self.testsub()  # Which subtitle format
        self.text = ""  # The output text
        self.text_finished = ""  # The output text after a final check
        self.prev = ""  # Previously read line, prevents duplicates
        self.junk = self.junklist()

    def testsub(self):
        """Return ``"vtt"`` or ``"srt"`` based on the file content, else ``None``."""
        with open(file.i, "r", encoding=enc.enc) as ts:
            for line in ts:
                if "WEBVTT" in line:
                    return "vtt"
                # next(ts, "") so that a lone "1" on the last line cannot raise StopIteration.
                if line.strip("\n") == "1" and re.search("(.*:.*:.*-->.*:.*:.*)", next(ts, "")):
                    return "srt"
        return None

    def junklist(self):
        """Return the regular expressions that are stripped from subtitle lines."""
        # This list will grow
        # Escaping and r(raw) tag needed for special characters
        return ["<.*?>", r"\{\\an8\}", r"^-\s", r"\[.*\]", r"\(.*\)", "^.*?:"]


def cls():
    """Clear the console on Windows (``cls``) or anything else (``clear``)."""
    os.system("cls" if os.name == "nt" else "clear")


def yn(yn):
    """Ask a yes/no question until it is answered.

    Args:
        yn: The question text; ``" [Y/N]"`` is appended when it is printed.

    Returns:
        True for "y"/"yes", False for "n"/"no" (case-insensitive).
    """
    while True:
        print(f"{yn} [Y/N]")
        choice = input().strip().lower()  # tolerate stray spaces around the answer
        if choice in {"yes", "y"}:
            return True
        if choice in {"no", "n"}:
            return False
        print("Please respond with 'yes' or 'no'")


def arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Quickly convert SRT or WEBVTT subtitles into plain text file.",
        epilog="Visit https://github.com/NebularNerd/subtotxt for more information.",
    )
    parser.add_argument(
        "--file", "-f", type=str, required=True, help="Path to .srt or .vtt file, enclose in quotes if path has spaces"
    )
    # Every remaining option is an optional on/off switch that defaults to False.
    switches = (
        ("--utf8", "-8", "Force output file to use UTF-8 instead of input encoding"),
        ("--pause", "-p", "Pauses at info step to allow viewing info before continuing"),
        ("--screen", "-s", "Prints the output to the console"),
        ("--copy", "-c", "Copies input to output without change, appends -copy to filename"),
        (
            "--overwrite",
            "-o",
            "Skips asking for permission to overwrite, will auto-delete old file and create a new one",
        ),
        (
            "--oneliners",
            "-1",
            "Write all sentences in one line, even if the original divides it into many lines or subtitles.",
        ),
    )
    for long_name, short_name, help_text in switches:
        parser.add_argument(long_name, short_name, default=False, action="store_true", required=False, help=help_text)
    return parser.parse_args()
# Characters that end a sentence when --oneliners joins subtitle lines.
_SENTENCE_ENDINGS = (".", "?", "!", "…")
# Timecode lines: SRT always carries hours, WebVTT may omit them.
_SRT_TIMECODE = re.compile("(.*:.*:.*-->.*:.*:.*)")
_VTT_TIMECODE = re.compile("(.*:.*-->.*:.*)")


def overwrite(f):
    """Send an existing output file to the trash so it can be recreated.

    Does nothing when ``f`` does not exist. Asks for confirmation unless the
    ``--overwrite`` switch was given.

    Args:
        f: Path of the output file about to be written.

    Raises:
        Exception: If the file exists and the user declines to replace it.
    """
    if not f.is_file():
        return
    if args.overwrite or yn("Output file already exists, delete and make a new one?"):
        print("Overwriting old file")
        send2trash(f)
    else:
        raise Exception("Output file already exists.")


def copy():
    """Copy the input file to the ``-copy`` file, re-encoding it to the output encoding."""
    overwrite(file.c)
    with open(file.i, "r", encoding=enc.enc) as original, open(file.c, "w", encoding=enc.out) as new:
        for line in original:
            if args.screen:
                print(line, end="")
            new.write(line)
    print(f"Output file: {file.c}")


def junk_strip(line, patterns=None):
    """Remove every junk pattern from ``line``.

    Based on PR#4 by eMPee584.

    Args:
        line: The text to clean.
        patterns: Regular expressions to remove; defaults to ``sub.junk``.

    Returns:
        The line with all matches removed (possibly an empty string).
    """
    if patterns is None:
        patterns = sub.junk
    # No blanket try/except here: removing text never raises, and an invalid
    # pattern should be reported rather than silently skipped.
    for pattern in patterns:
        line = re.sub(pattern, "", line)
    return line


def process_line(line):
    """Clean one subtitle line and append it to ``sub.text``.

    Empty lines and lines identical to the previously accepted one are dropped.
    With ``--oneliners`` a line is followed by a space unless it ends a sentence;
    otherwise every line ends with a newline.
    """
    # Strip formatting junk first so duplicates are compared on the cleaned text.
    line = junk_strip(line).strip()
    if not line or line == sub.prev:
        return
    # One liners based on PR#2 by adam-sierakowski
    if args.oneliners and not line.endswith(_SENTENCE_ENDINGS):
        ln = f"{line} "
    else:
        ln = f"{line}\n"
    sub.text += ln
    if args.screen:
        print(ln, end="")
    # Remember the cleaned line itself, not the version with the separator appended,
    # otherwise the comparison above can never match.
    sub.prev = line


def _iter_text_lines(lines, timecode, *, vtt=False):
    """Yield the lines of ``lines`` that hold subtitle text.

    A line equal to the next expected cue number is skipped together with the
    timecode line that follows it. If the following line is not a timecode it is
    put back and examined normally, so no text is lost. With ``vtt=True``
    everything before the first timecode is treated as header and skipped, and
    timecode lines without a cue number are skipped as well.
    """
    source = iter(lines)
    put_back = []
    expected = 1
    in_header = vtt
    while True:
        if put_back:
            line = put_back.pop()
        else:
            line = next(source, None)
            if line is None:
                return
        stripped = line.strip("\n")
        if stripped == str(expected):
            following = next(source, None)  # None at end of file, never StopIteration
            if following is not None and timecode.search(following):
                expected += 1
                in_header = False
                continue
            if following is not None:
                put_back.append(following)
        if vtt and timecode.search(line):
            in_header = False
        elif stripped != "" and not in_header:
            yield line


def do_srt():
    """Convert a SubRip (.srt) file.

    https://en.wikipedia.org/wiki/SubRip
    The format has a line number followed by a timecode on the next line, then text.
    """
    with open(file.i, "r", encoding=enc.enc) as original:
        for line in _iter_text_lines(original, _SRT_TIMECODE):
            process_line(line)
    write_to_file()


def do_vtt():
    """Convert a WebVTT (Web Video Text Tracks, .vtt) file.

    https://en.wikipedia.org/wiki/WebVTT
    https://www.checksub.com/blog/guide-use-webvtt-subtitles-format
    This format has a few differing 'standards': metadata, notes, styles,
    timecodes with optional hours, and optional line numbers. Everything before
    the first timecode is skipped as header.
    """
    with open(file.i, "r", encoding=enc.enc) as original:
        for line in _iter_text_lines(original, _VTT_TIMECODE, vtt=True):
            process_line(line)
    write_to_file()


def write_to_file():
    """Run a final junk check over the collected text and write it to ``file.o``."""
    # We check for junk again because it can get split over two lines and we can't find it until now.
    sub.text_finished += "".join(f"{junk_strip(line)}\n" for line in sub.text.splitlines())
    with open(file.o, "w", encoding=enc.out) as new:
        new.write(sub.text_finished)


def do_work():
    """Convert the input file with the converter matching ``sub.format``.

    Raises:
        Exception: If the subtitle format was not recognised.
    """
    overwrite(file.o)
    converters = {"srt": do_srt, "vtt": do_vtt}
    converter = converters.get(sub.format)
    if converter is None:
        raise Exception("Unable to determine Subtitle format.")
    converter()


if __name__ == "__main__":
    # args, file, enc and sub are module-level names read by the functions above.
    args = arguments()
    cls()
    try:
        print(f"SUB to TXT v2025-01-31\n{'-' * 22}")
        file = file_handler(Path(args.file))
        enc = encoding(file.i)
        if args.pause and not yn("Ready to start?"):
            raise Exception("User exited at pause before start")
        if args.copy:
            copy()
        else:
            sub = subtitle()
            do_work()
        print("\nFinished!\n")
    except Exception as error:
        # Top-level boundary: report the reason instead of showing a traceback.
        print(f"Script execution stopped because:\n{error}")