ref: Move to OO implementation

Heavily inspired by the `youtube-dl` implementation, I moved to an OO implementation where every source type now has its own class, making it easy to add new providers. It also now has a fallback: if no provider is chosen, it will try to download with `YoutubeDlProviderBase`. Added `_TEST` to each class to make it easy to add tests for new providers.
author: gabrielgio <gabriel.giovanini@pm.me> 2021-07-18 19:56:59 +0200
committer: gabrielgio <gabriel.giovanini@pm.me> 2021-07-18 19:56:59 +0200
commit: 10cbc378ad0daf0e80f5ceed92d70fdbf573df88 (patch)
tree: a4217e75f591632ed383e334ed8e61935cd2b096
parent: b453f05d18c261d3ce3b20bb5aaa2504da562756 (diff)
download: reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.tar.gz
reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.tar.bz2
reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.zip
15 files changed, 212 insertions, 200 deletions
diff --git a/Dockerfile b/Dockerfile
index a3b227c..1901d51 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,8 +7,7 @@ WORKDIR /opt
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
-COPY importer/downloader.py .
-COPY importer/uploader.py .
+COPY importer/ .
 COPY main.py .
 
 
diff --git a/importer/downloader.py b/importer/downloader.py
index c67834b..1dd1684 100644
--- a/importer/downloader.py
+++ b/importer/downloader.py
@@ -1,157 +1,22 @@
-import os
 import re
-import shutil
-from enum import Enum
-from urllib.parse import urlparse
-
-import youtube_dl
-import requests
 
 from praw import Reddit
 
-
-class SourceType(Enum):
-    VREDDIT = 1
-    IREDDIT = 2
-    YOUTUBE = 4
-    REDGIFS = 5
-    IMAGURJPG = 6
-    GFYCAT = 7
-    GREDDIT = 8
-    UNKNOWN = 1000
-
-
-OUTTMPL = 'source_%(id)s.%(ext)s'
+from importer.providers import GReddit, Gfycat, IReddit, Imgur, RedGifs, VReddit, YoutubeDlProviderBase, \
+    RawImageProviderBase, Youtube
 
 
 class Downloader:
-    reddit: Reddit
-    username: str
-    downloaded: bool
-    post_id: str
-    source_type: SourceType
-    paths: list[str]
+    providers = [GReddit, Gfycat, IReddit, Imgur, RedGifs, VReddit, Youtube, RawImageProviderBase,
+                 YoutubeDlProviderBase]
 
     def __init__(self, url: str, reddit: Reddit):
-        self.reddit = reddit
-        self.downloaded = False
+        self.Provider = next(filter(lambda x: re.match(x.regex, url), self.providers))
         self.url = url
-        self.source_type = self._get_source_type(url)
-        self.paths = []
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.delete()
+        self.reddit = reddit
 
     def download(self):
-        try:
-            if self.source_type == SourceType.VREDDIT:
-                self._download_vreddit()
-            elif self.source_type == SourceType.REDGIFS:
-                self._download_redgifs()
-            elif self.source_type == SourceType.GFYCAT:
-                self._download_gifycat()
-            elif self.source_type == SourceType.YOUTUBE:
-                self._download_youtube()
-            elif self.source_type in (SourceType.IMAGURJPG, SourceType.IREDDIT):
-                self._download_raw_file()
-            elif self.source_type == SourceType.GREDDIT:
-                self._download_gallery_reddit()
-        except Exception as e:
-            self.downloaded = False
-
-    def delete(self):
-        if self.paths:
-            for path in self.paths:
-                if os.path.exists(path):
-                    os.unlink(path)
-
-    def _download_youtube_dls(self, ydl_opts):
-        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(self.url, download=True)
-            if info.get('_type', None) == 'playlist':
-                for entry in info['entries']:
-                    r = ydl.prepare_filename(entry)
-                    self.paths.append(f'{os.path.splitext(r)[0]}.mp4')
-            else:
-                r = ydl.prepare_filename(info)
-                self.paths.append(f'{os.path.splitext(r)[0]}.mp4')
-
-            self.downloaded = True
-
-    def _download_redgifs(self):
-        ydl_opts = {
-            'format': 'best',
-            'merge_output_format': 'mp4',
-            'outtmpl': OUTTMPL
-        }
-        self._download_youtube_dls(ydl_opts)
-
-    def _download_gifycat(self):
-        ydl_opts = {
-            'format': 'best',
-            'merge_output_format': 'mp4',
-            'outtmpl': OUTTMPL
-        }
-        self._download_youtube_dls(ydl_opts)
-
-    def _download_vreddit(self):
-        ydl_opts = {
-            'format': 'bestvideo+bestaudio/bestvideo',
-            'merge_output_format': 'mp4',
-            'outtmpl': OUTTMPL
-        }
-        self._download_youtube_dls(ydl_opts)
-
-    def _download_youtube(self):
-        ydl_opts = {
-            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio',
-            'merge_output_format': 'mp4',
-            'outtmpl': OUTTMPL
-        }
-        self._download_youtube_dls(ydl_opts)
-
-    def _download_raw_file(self):
-        a = urlparse(self.url)
-        path = f'source_{os.path.basename(a.path)}'
-
-        r = requests.get(self.url, stream=True)
-        if r.status_code == 200:
-            self.downloaded = True
-            with open(path, 'wb') as f:
-                r.raw.decode_content = True
-                shutil.copyfileobj(r.raw, f)
-            self.paths.append(path)
-        else:
-            self.downloaded = False
-
-    def _download_gallery_reddit(self):
-        url = self.url
-        submission = self.reddit.submission(url=self.url)
-        for key in submission.media_metadata:
-            value = submission.media_metadata[key]
-            self.url = value['s']['u']
-            self._download_raw_file()
-
-        self.url = url
-
-    @staticmethod
-    def _get_source_type(url):
-        if re.match("^.*v\\.redd\\.it.*$", url):
-            return SourceType.VREDDIT
-        if re.match("^.*i\\.redd\\.it.*\\.(jpg|jpeg)$", url):
-            return SourceType.IREDDIT
-        if re.match("^.*\\.youtube\\.com.*$", url):
-            return SourceType.YOUTUBE
-        if re.match("^.*redgifs\\.com.*$", url):
-            return SourceType.REDGIFS
-        if re.match("^.*i\\.imgur\\.com.*\\.(jpg|jpeg)$", url):
-            return SourceType.IMAGURJPG
-        if re.match("^.*gfycat.com.*$", url):
-            return SourceType.GFYCAT
-        if re.match("^.*www.reddit.com/gallery.*$", url):
-            return SourceType.GREDDIT
-
-        return SourceType.UNKNOWN
+        with self.Provider(url=self.url, reddit=self.reddit) as provider:
+            provider.download()
+            self.paths = provider.paths
+            self.downloaded = provider.downloaded
diff --git a/importer/providers/__init__.py b/importer/providers/__init__.py
new file mode 100644
index 0000000..62c2d85
--- /dev/null
+++ b/importer/providers/__init__.py
@@ -0,0 +1,10 @@
+from .g_reddit import GReddit
+from .providerbase import ProviderBase
+from .gfycat import Gfycat
+from .i_reddit import IReddit
+from .imgur import Imgur
+from .raw_image_base import RawImageProviderBase
+from .redgifs import RedGifs
+from .v_reddit import VReddit
+from .youtube import Youtube
+from .youtube_dl_base import YoutubeDlProviderBase
diff --git a/importer/providers/g_reddit.py b/importer/providers/g_reddit.py
new file mode 100644
index 0000000..53ee5df
--- /dev/null
+++ b/importer/providers/g_reddit.py
@@ -0,0 +1,19 @@
+from praw import Reddit
+
+from importer.providers.raw_image_base import RawImageProviderBase
+
+
+class GReddit(RawImageProviderBase):
+    regex = "^.*www.reddit.com/gallery.*$"
+
+    def __init__(self, url: str, reddit: Reddit):
+        super(GReddit, self).__init__(url)
+        self.reddit = reddit
+
+    def download(self):
+        submission = self.reddit.submission(url=self.url)
+        for key in submission.media_metadata:
+            value = submission.media_metadata[key]
+            url = value['s']['u']
+            path = self._download_raw_file(url)
+            self.paths.append(path)
diff --git a/importer/providers/gfycat.py b/importer/providers/gfycat.py
new file mode 100644
index 0000000..70d9c05
--- /dev/null
+++ b/importer/providers/gfycat.py
@@ -0,0 +1,9 @@
+from importer.providers.youtube_dl_base import YoutubeDlProviderBase
+
+
+class Gfycat(YoutubeDlProviderBase):
+    regex = "^.*gfycat.com.*$"
+    _TEST = [{
+        "url": "https://gfycat.com/presentdangerousdromedary",
+        "paths": "source_presentdangerousdromedary.mp4"
+    }]
diff --git a/importer/providers/i_reddit.py b/importer/providers/i_reddit.py
new file mode 100644
index 0000000..797ce43
--- /dev/null
+++ b/importer/providers/i_reddit.py
@@ -0,0 +1,9 @@
+from importer.providers.raw_image_base import RawImageProviderBase
+
+
+class IReddit(RawImageProviderBase):
+    regex = "^.*i\\.redd\\.it.*\\.(jpg|jpeg)$"
+    _TEST = [{
+        "url": "https://i.redd.it/pjj1ll1b2rr41.jpg",
+        "paths": ["source_pjj1ll1b2rr41.jpg"]
+    }]
diff --git a/importer/providers/imgur.py b/importer/providers/imgur.py
new file mode 100644
index 0000000..dd8fb6e
--- /dev/null
+++ b/importer/providers/imgur.py
@@ -0,0 +1,9 @@
+from importer.providers.raw_image_base import RawImageProviderBase
+
+
+class Imgur(RawImageProviderBase):
+    regex = "^.*i\\.imgur\\.com.*\\.(jpg|jpeg)$"
+    _TEST = [{
+        "url": "https://i.imgur.com/fXLMjfp.jpg",
+        "paths": ["source_fXLMjfp.jpg"],
+    }]
diff --git a/importer/providers/providerbase.py b/importer/providers/providerbase.py
new file mode 100644
index 0000000..374b9af
--- /dev/null
+++ b/importer/providers/providerbase.py
@@ -0,0 +1,30 @@
+import os
+from typing import List
+
+
+class ProviderBase:
+    paths: List[str]
+    downloaded: bool
+    regex: str
+
+    _TEST = [{
+        "url": "https://i.imgur.com/fXLMjfp.jpg",
+        "paths": ["source_fXLMjfp.jpg"],
+    }]
+
+    def __init__(self, url: str):
+        self.url = url
+        self.paths = []
+        self.downloaded = False
+
+    def download(self):
+        pass
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.paths:
+            for path in self.paths:
+                if os.path.exists(path):
+                    os.unlink(path)
diff --git a/importer/providers/raw_image_base.py b/importer/providers/raw_image_base.py
new file mode 100644
index 0000000..267dcad
--- /dev/null
+++ b/importer/providers/raw_image_base.py
@@ -0,0 +1,27 @@
+import os
+import shutil
+from urllib.parse import urlparse
+
+import requests
+
+from importer.providers.providerbase import ProviderBase
+
+
+class RawImageProviderBase(ProviderBase):
+    regex = "^.*i.(jpg|jpeg|mp4)$"
+
+    def download(self):
+        path = self._download_raw_file(self.url)
+        self.paths.append(path)
+        self.downloaded = True
+
+    @staticmethod
+    def _download_raw_file(url: str) -> str:
+        a = urlparse(url)
+        path = f'source_{os.path.basename(a.path)}'
+        r = requests.get(url, stream=True)
+        if r.status_code == 200:
+            with open(path, 'wb') as f:
+                r.raw.decode_content = True
+                shutil.copyfileobj(r.raw, f)
+        return path
diff --git a/importer/providers/redgifs.py b/importer/providers/redgifs.py
new file mode 100644
index 0000000..e15468f
--- /dev/null
+++ b/importer/providers/redgifs.py
@@ -0,0 +1,9 @@
+from importer.providers.youtube_dl_base import YoutubeDlProviderBase
+
+
+class RedGifs(YoutubeDlProviderBase):
+    regex = "^.*redgifs\\.com.*$"
+    _TEST = [{
+        "url": "https://redgifs.com/watch/ripesnivelingfiddlercrab",
+        "paths": ["source_RipeSnivelingFiddlercrab.mp4", 'source_RipeSnivelingFiddlercrab-mobile.mp4']
+    }]
diff --git a/importer/providers/v_reddit.py b/importer/providers/v_reddit.py
new file mode 100644
index 0000000..2917fee
--- /dev/null
+++ b/importer/providers/v_reddit.py
@@ -0,0 +1,10 @@
+from importer.providers.youtube_dl_base import YoutubeDlProviderBase
+
+
+class VReddit(YoutubeDlProviderBase):
+    regex = "^.*v\\.redd\\.it.*$"
+    format = 'bestvideo+bestaudio/bestvideo'
+    _TEST = [{
+        "url": "https://v.redd.it/42j6r7i8z7151",
+        "paths": ["source_42j6r7i8z7151.mp4"]
+    }]
diff --git a/importer/providers/youtube.py b/importer/providers/youtube.py
new file mode 100644
index 0000000..d880aa0
--- /dev/null
+++ b/importer/providers/youtube.py
@@ -0,0 +1,10 @@
+from importer.providers.youtube_dl_base import YoutubeDlProviderBase
+
+
+class Youtube(YoutubeDlProviderBase):
+    regex = "^.*\\.youtube\\.com.*$"
+    format = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio"
+    _TEST = [{
+        "url": "https://www.youtube.com/watch?v=Wjrrgrvq1ew",
+        "paths": ["source_Wjrrgrvq1ew.mp4"]
+    }]
diff --git a/importer/providers/youtube_dl_base.py b/importer/providers/youtube_dl_base.py
new file mode 100644
index 0000000..3bb2fb8
--- /dev/null
+++ b/importer/providers/youtube_dl_base.py
@@ -0,0 +1,36 @@
+import os
+
+import youtube_dl
+
+from importer.providers.providerbase import ProviderBase
+
+
+class YoutubeDlProviderBase(ProviderBase):
+    regex = ".*"
+    output_template: str = 'source_%(id)s.%(ext)s'
+    format: str = "best"
+    merge_format_output: str = "mp4"
+
+    _TEST = [{
+        "url": "https://www.youtube.com/watch?v=Wjrrgrvq1ew",
+        "paths": ["source_Wjrrgrvq1ew.mp4"]
+    }]
+
+    def download(self):
+        ydl_opts = {
+            'format': self.format,
+            'merge_output_format': self.merge_format_output,
+            'outtmpl': self.output_template
+        }
+
+        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(self.url, download=True)
+            if info.get('_type', None) == 'playlist':
+                for entry in info['entries']:
+                    r = ydl.prepare_filename(entry)
+                    self.paths.append(f'{os.path.splitext(r)[0]}.mp4')
+            else:
+                r = ydl.prepare_filename(info)
+                self.paths.append(f'{os.path.splitext(r)[0]}.mp4')
+
+            self.downloaded = True
diff --git a/test/test_download.py b/test/test_download.py
deleted file mode 100644
index f13ed67..0000000
--- a/test/test_download.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-
-import pytest
-
-from importer.downloader import SourceType, Downloader
-
-@pytest.fixture
-def mock_ydl_download(mocker):
-    # this function is responsible for downloading the file
-    return mocker.patch('importer.downloader.youtube_dl.YoutubeDL.process_info')
-
-
-@pytest.mark.parametrize('url,source_type', [
-    ("https://i.redd.it/pjj1ll1b2rr41.jpg", SourceType.IREDDIT),
-    ("https://gfycat.com/presentdangerousdromedary", SourceType.GFYCAT),
-    ("https://i.imgur.com/fXLMjfp.jpg", SourceType.IMAGURJPG),
-    ("https://redgifs.com/watch/ripesnivelingfiddlercrab", SourceType.REDGIFS),
-    ("https://www.youtube.com/watch?v=oLkdqptmfng", SourceType.YOUTUBE),
-    ("https://v.redd.it/42j6r7i8z7151", SourceType.VREDDIT),
-    ("https://www.reddit.com/gallery/mik7c9", SourceType.GREDDIT),
-    ("https://duckduckgo.com", SourceType.UNKNOWN),
-])
-def test_source_type(url, source_type):
-    with Downloader(url, "1-A") as d:
-        assert d.source_type == source_type
-
-
-@pytest.mark.parametrize('url,paths', [
-    ("https://gfycat.com/presentdangerousdromedary", ["source_presentdangerousdromedary.mp4"]),
-    ("https://redgifs.com/watch/ripesnivelingfiddlercrab", ["source_RipeSnivelingFiddlercrab.mp4", 'source_RipeSnivelingFiddlercrab-mobile.mp4']),
-    ("https://www.youtube.com/watch?v=oLkdqptmfng", ["source_oLkdqptmfng.mp4"]),
-    ("https://v.redd.it/42j6r7i8z7151", ["source_42j6r7i8z7151.mp4"]),
-])
-def test_download_youtube_dl(url, paths, mock_ydl_download):
-    with Downloader(url, "1-A") as d:
-        assert d.downloaded is False
-        d.download()
-        assert d.downloaded is True
-        assert d.paths == paths
-        mock_ydl_download.assert_called()
-
-
-@pytest.mark.parametrize('url,path', [
-    ("https://i.redd.it/pjj1ll1b2rr41.jpg", "source_pjj1ll1b2rr41.jpg"),
-    ("https://i.imgur.com/fXLMjfp.jpg", "source_fXLMjfp.jpg"),
-])
-def test_download_raw_data(url, path):
-    with Downloader(url, "1-A") as d:
-        assert d.downloaded is False
-        d.download()
-        assert d.paths == [path]
-        assert d.downloaded is True
-
diff --git a/test/test_providers.py b/test/test_providers.py
new file mode 100644
index 0000000..9a5084e
--- /dev/null
+++ b/test/test_providers.py
@@ -0,0 +1,23 @@
+import praw
+import pytest
+
+from importer.downloader import Downloader
+import importer.providers as providers
+from importer.providers import ProviderBase
+
+
+@pytest.mark.parametrize("provider",
+                         [
+                             providers.IReddit,
+                             providers.Imgur,
+                             providers.RawImageProviderBase,
+                             providers.RedGifs,
+                             providers.Youtube,
+                             providers.YoutubeDlProviderBase
+                         ])
+def test_provider(provider):
+    for test in provider._TEST:
+        with provider(url=test['url']) as p:
+            p.download()
+            assert p.downloaded
+            assert p.paths == test['paths']
author	gabrielgio <gabriel.giovanini@pm.me>	2021-07-18 19:56:59 +0200
committer	gabrielgio <gabriel.giovanini@pm.me>	2021-07-18 19:56:59 +0200
commit	10cbc378ad0daf0e80f5ceed92d70fdbf573df88 (patch)
tree	a4217e75f591632ed383e334ed8e61935cd2b096
parent	b453f05d18c261d3ce3b20bb5aaa2504da562756 (diff)
download	reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.tar.gz reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.tar.bz2 reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.zip