# Netflix-videos-downloader/helpers/dfxp_to_srt.py

import codecs
import math
import os
import re


class dfxp_to_srt:
    """Convert DFXP (TTML) subtitle markup into SubRip (.srt) text.

    The conversion is regex based and works line by line rather than with an
    XML parser: a subtitle is recognised only when its ``<p begin="..."``
    element opens and closes on a single line of the input.
    """

    # Tick-based timestamps are interpreted at 10,000,000 ticks per second,
    # i.e. 10,000 ticks per millisecond.
    # NOTE(review): a ``ttp:tickRate`` declared by the document is not read.
    _TICKS_PER_MS = 10000

    _REGION_TAG_RE = re.compile(r"<region\b[^>]*>")
    _XML_ID_RE = re.compile(r'xml:id="([^"]*)"')
    _BEGIN_RE = re.compile(r'begin="([0-9:.]*)')
    _END_RE = re.compile(r'end="([0-9:.]*)')
    _CONTENT_RE = re.compile(r'">(.*)</p>')
    # Span tags are only used for italics, so they are rewritten to <i> and
    # </i>, which is what .srt players understand.
    _SPAN_OPEN_RE = re.compile(r'(<span style="[a-zA-Z0-9_.]+">)+')
    _SPAN_CLOSE_RE = re.compile(r"(</span>)+")
    _BR_RE = re.compile(r"(<br\s*/?>)+")

    def __init__(self):
        # Not read anywhere inside this class; kept in case outside code does.
        self.__replace__ = "empty_line"

    def leading_zeros(self, value, digits=2):
        """Return ``value`` as a string left-padded with zeros.

        Args:
            value: The number (or any object with a ``str`` form) to pad.
            digits: Minimum width of the result.

        Returns:
            The padded string. Values that are already wider than ``digits``
            are returned in full instead of being truncated.
        """
        return str(value).zfill(digits)

    def convert_time(self, raw_time):
        """Convert a tick count into an SRT ``HH:MM:SS,mmm`` timestamp.

        Args:
            raw_time: The tick count as a string of digits (an ``int`` is
                accepted as well).

        Returns:
            The timestamp string; sub-millisecond ticks are discarded.

        Raises:
            ValueError: If ``raw_time`` cannot be parsed as an integer.
        """
        total_ms = int(raw_time) // self._TICKS_PER_MS
        total_seconds, ms = divmod(total_ms, 1000)
        total_minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return "{}:{}:{},{}".format(
            self.leading_zeros(hours),
            self.leading_zeros(minutes),
            self.leading_zeros(seconds),
            self.leading_zeros(ms, 3),
        )

    def xml_id_display_align_before(self, text):
        """Return the ``xml:id`` of the region displayed at the top.

        Args:
            text: The full DFXP document.

        Returns:
            The id of the first ``<region>`` tag carrying
            ``tts:displayAlign="before"`` and an ``xml:id`` attribute, in
            either attribute order, or an empty string when there is none.
        """
        for tag in self._REGION_TAG_RE.finditer(text):
            attributes = tag.group()
            if u'tts:displayAlign="before"' not in attributes:
                continue
            found = self._XML_ID_RE.search(attributes)
            if found:
                return found.group(1)
        return u""

    def xml_to_srt(self, text):
        """Convert a DFXP document into the text of an .srt file.

        Consecutive subtitle lines that share the same begin and end time are
        merged into one cue. Subtitles placed in the top-aligned region are
        prefixed with ``{\\an8}``.

        Args:
            text: The full DFXP document.

        Returns:
            The SRT text, or an empty string when ``text`` contains no
            subtitle lines.

        Raises:
            AttributeError: If a ``<p begin=`` line has no ``</p>`` on the
                same line or lacks a double-quoted ``begin``/``end`` value.
            ValueError: If a tick-based timestamp contains no digits.
        """
        region_id = self.xml_id_display_align_before(text)
        top_region_re = None
        if region_id:
            # Built once; the id is escaped because ids such as "region.top"
            # contain regex metacharacters.
            top_region_re = re.compile(
                r'(<p\b[^>]*region="' + re.escape(region_id) + r'"[^>]*>)'
            )

        cues = []
        current = None  # [start, end, is_clock_time, [content lines]]
        for line in text.split("\n"):
            if u"<p begin=" not in line:
                continue

            line = self._SPAN_OPEN_RE.sub(u"<i>", line)
            if top_region_re is not None:
                # In the replacement template "\\" yields one backslash.
                line = top_region_re.sub(r"\1{\\an8}", line, count=1)

            content = self._CONTENT_RE.search(line).group(1)
            # Each run of <br> tags becomes one newline: a blank line inside
            # a cue would end it prematurely in the .srt output.
            content = self._BR_RE.sub(u"\n", content)
            content = self._SPAN_CLOSE_RE.sub(u"</i>", content)

            start = self._BEGIN_RE.search(line).group(1)
            end = self._END_RE.search(line).group(1)
            is_clock_time = ":" in start
            if is_clock_time:
                # Already HH:MM:SS.mmm; SRT only differs in the separator.
                start = start.replace(".", ",")
                end = end.replace(".", ",")

            same_slot = (
                current is not None
                and current[0] == start
                and current[1] == end
            )
            # A pending cue whose start is empty cannot be emitted on its
            # own, so its text is carried over into the following cue.
            if current is not None and (same_slot or not current[0]):
                current[0], current[1], current[2] = start, end, is_clock_time
                current[3].append(content)
                continue

            if current is not None:
                cues.append(current)
            current = [start, end, is_clock_time, [content]]

        if current is None:
            return u""
        cues.append(current)

        blocks = []
        for number, (start, end, is_clock_time, lines) in enumerate(cues, 1):
            if not is_clock_time:
                start = self.convert_time(start)
                end = self.convert_time(end)
            blocks.append(
                u"{}\n{} --> {}\n{}\n".format(
                    number, start, end, u"\n".join(lines)
                )
            )
        return u"\n".join(blocks)

    def convert(self, Input, Output):
        """Read the DFXP file ``Input`` and write its SRT form to ``Output``.

        Both files are handled as UTF-8.

        Args:
            Input: Path of the DFXP file to read.
            Output: Path of the .srt file to create or overwrite.
        """
        with codecs.open(Input, "rb", "utf-8") as source:
            text = source.read()

        # Convert before opening the destination so that a failed conversion
        # does not leave a truncated, empty output file behind.
        srt_text = self.xml_to_srt(text)

        with codecs.open(Output, "wb", "utf-8") as target:
            target.write(srt_text)