ytsm/app/YtManagerApp/utils/subscription_file_parser.py

109 lines
2.9 KiB
Python
Raw Permalink Normal View History

from typing import Iterable, Optional
from xml.etree import ElementTree
import re
class FormatNotSupportedError(Exception):
    """Raised when a subscription file is not in any supported format."""
class SubFileParser(object):
    """
    Base class for subscription file parsers.

    The base implementation recognises nothing: ``probe`` always returns
    False and ``parse`` returns an empty list. Subclasses override both.
    """

    def probe(self, file_handle) -> bool:
        """
        Tests if file matches file format.
        :param file_handle: File handle
        :return: True if file matches, false otherwise
        """
        return False

    def parse(self, file_handle) -> Iterable[str]:
        """
        Parses file and returns a list of subscription URLs.
        :param file_handle: File handle
        :return: Iterable of subscription URLs (always empty in this base class)
        """
        return []
class SubscriptionListFileParser(SubFileParser):
    """
    A subscription list file is file which contains just a bunch of URLs.
    Comments are supported using # character.
    """

    # A '#' starts a comment only at the beginning of a line or after
    # whitespace, so a '#' embedded in a URL (e.g. a fragment) is kept.
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    _COMMENT_RE = re.compile(r'(^|\s)#.*')

    def __is_url(self, text: str) -> bool:
        """Return True if text starts with an http:// or https:// scheme."""
        return text.startswith(('http://', 'https://'))

    def __clean_lines(self, file_handle):
        """
        Rewind the file and yield each non-empty line with comments and
        surrounding whitespace removed. Lines read as bytes/bytearray are
        decoded with the default codec (UTF-8).
        :param file_handle: File handle (text or binary mode)
        :return: Generator of cleaned, non-empty lines
        """
        file_handle.seek(0)
        for line in file_handle:
            if isinstance(line, (bytes, bytearray)):
                line = line.decode()
            # Trim comments and spaces
            line = self._COMMENT_RE.sub('', line).strip()
            if line:
                yield line

    def probe(self, file_handle):
        """
        Tests if the first meaningful line of the file is a URL.
        :param file_handle: File handle
        :return: True if the first non-empty, non-comment line is an
                 http(s) URL; False if it is not, if the file has no such
                 line, or if the content cannot be decoded as text
        """
        try:
            first_line = next(self.__clean_lines(file_handle), None)
        except UnicodeDecodeError:
            # Undecodable (binary) content is simply "not this format";
            # report a mismatch instead of aborting format detection.
            return False
        return first_line is not None and self.__is_url(first_line)

    def parse(self, file_handle):
        """
        Yields every non-empty, non-comment line of the file.
        Lines are not checked against the URL test used by probe.
        :param file_handle: File handle
        :return: Generator of lines with comments and whitespace stripped
        """
        yield from self.__clean_lines(file_handle)
class OPMLParser(SubFileParser):
    """
    Parses OPML files (emitted by YouTube)

    The most recently parsed tree is cached together with its file handle,
    so calling probe and then parse on the same handle parses the XML once.
    """

    def __init__(self):
        self.__cached_file = None
        self.__cached_tree: Optional[ElementTree.ElementTree] = None

    def __parse(self, file_handle):
        """
        Return the parsed XML tree for file_handle, reusing the cached tree
        when the very same handle object was parsed last.
        :param file_handle: File handle
        :return: Parsed ElementTree
        :raises ElementTree.ParseError: if the content is not well-formed XML
        """
        # Identity, not equality: the cache is only valid for the exact same
        # handle object, and only once a tree has actually been stored.
        # NOTE(review): the cache cannot notice that the content behind the
        # same handle object changed between calls - confirm callers never
        # rewrite a handle in place.
        if self.__cached_tree is not None and file_handle is self.__cached_file:
            return self.__cached_tree

        file_handle.seek(0)
        # NOTE(review): xml.etree.ElementTree is documented as not secure
        # against maliciously constructed data; if these files come from
        # untrusted uploads, consider a hardened parser.
        tree = ElementTree.parse(file_handle)

        self.__cached_file = file_handle
        self.__cached_tree = tree
        return tree

    def probe(self, file_handle):
        """
        Tests if the file is XML whose root element is <opml>
        (case-insensitive).
        :param file_handle: File handle
        :return: True if file matches, False otherwise (including
                 malformed XML)
        """
        try:
            tree = self.__parse(file_handle)
        except ElementTree.ParseError:
            # Malformed XML
            return False

        return tree.getroot().tag.lower() == 'opml'

    def parse(self, file_handle):
        """
        Yields the xmlUrl attribute of every <outline> element that has one.
        :param file_handle: File handle
        :return: Generator of feed URLs
        :raises ElementTree.ParseError: if the content is not well-formed XML
        """
        tree = self.__parse(file_handle)
        for node in tree.getroot().iter('outline'):
            url = node.get('xmlUrl')
            if url is not None:
                yield url
# Registered parser instances. parse() probes them in this order and uses
# the first one whose probe() accepts the file.
PARSERS = (
    OPMLParser(),
    SubscriptionListFileParser(),
)
def parse(file_handle) -> Iterable[str]:
    """
    Parses a subscription file with the first registered parser that
    recognises it.
    :param file_handle: File handle
    :return: Iterable of entries produced by the matching parser
    :raises FormatNotSupportedError: if no parser in PARSERS accepts the file
    """
    for parser in PARSERS:
        if parser.probe(file_handle):
            return parser.parse(file_handle)
    raise FormatNotSupportedError('This file cannot be parsed!')