diff options
author | gabrielgio <gabriel.giovanini@pm.me> | 2021-07-18 19:56:59 +0200 |
---|---|---|
committer | gabrielgio <gabriel.giovanini@pm.me> | 2021-07-18 19:56:59 +0200 |
commit | 10cbc378ad0daf0e80f5ceed92d70fdbf573df88 (patch) | |
tree | a4217e75f591632ed383e334ed8e61935cd2b096 | |
parent | b453f05d18c261d3ce3b20bb5aaa2504da562756 (diff) | |
download | reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.tar.gz reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.tar.bz2 reddit-nextcloud-importer-10cbc378ad0daf0e80f5ceed92d70fdbf573df88.zip |
ref: Move to OO implementation
Heavily inspired by the `youtube-dl` implementation, I moved to an OO
implementation where every source type now has its own class, making
it easy to add new providers.
Also, it now has a fallback: if no provider is chosen, it will
try to download with `YoutubeDlProviderBase`.
Add `_TEST` to each class to make it easy to add tests for new providers.
-rw-r--r-- | Dockerfile | 3 | ||||
-rw-r--r-- | importer/downloader.py | 155 | ||||
-rw-r--r-- | importer/providers/__init__.py | 10 | ||||
-rw-r--r-- | importer/providers/g_reddit.py | 19 | ||||
-rw-r--r-- | importer/providers/gfycat.py | 9 | ||||
-rw-r--r-- | importer/providers/i_reddit.py | 9 | ||||
-rw-r--r-- | importer/providers/imgur.py | 9 | ||||
-rw-r--r-- | importer/providers/providerbase.py | 30 | ||||
-rw-r--r-- | importer/providers/raw_image_base.py | 27 | ||||
-rw-r--r-- | importer/providers/redgifs.py | 9 | ||||
-rw-r--r-- | importer/providers/v_reddit.py | 10 | ||||
-rw-r--r-- | importer/providers/youtube.py | 10 | ||||
-rw-r--r-- | importer/providers/youtube_dl_base.py | 36 | ||||
-rw-r--r-- | test/test_download.py | 53 | ||||
-rw-r--r-- | test/test_providers.py | 23 |
15 files changed, 212 insertions, 200 deletions
@@ -7,8 +7,7 @@ WORKDIR /opt COPY requirements.txt . RUN pip install -r requirements.txt -COPY importer/downloader.py . -COPY importer/uploader.py . +COPY importer/ . COPY main.py . diff --git a/importer/downloader.py b/importer/downloader.py index c67834b..1dd1684 100644 --- a/importer/downloader.py +++ b/importer/downloader.py @@ -1,157 +1,22 @@ -import os import re -import shutil -from enum import Enum -from urllib.parse import urlparse - -import youtube_dl -import requests from praw import Reddit - -class SourceType(Enum): - VREDDIT = 1 - IREDDIT = 2 - YOUTUBE = 4 - REDGIFS = 5 - IMAGURJPG = 6 - GFYCAT = 7 - GREDDIT = 8 - UNKNOWN = 1000 - - -OUTTMPL = 'source_%(id)s.%(ext)s' +from importer.providers import GReddit, Gfycat, IReddit, Imgur, RedGifs, VReddit, YoutubeDlProviderBase, \ + RawImageProviderBase, Youtube class Downloader: - reddit: Reddit - username: str - downloaded: bool - post_id: str - source_type: SourceType - paths: list[str] + providers = [GReddit, Gfycat, IReddit, Imgur, RedGifs, VReddit, Youtube, RawImageProviderBase, + YoutubeDlProviderBase] def __init__(self, url: str, reddit: Reddit): - self.reddit = reddit - self.downloaded = False + self.Provider = next(filter(lambda x: re.match(x.regex, url), self.providers)) self.url = url - self.source_type = self._get_source_type(url) - self.paths = [] - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.delete() + self.reddit = reddit def download(self): - try: - if self.source_type == SourceType.VREDDIT: - self._download_vreddit() - elif self.source_type == SourceType.REDGIFS: - self._download_redgifs() - elif self.source_type == SourceType.GFYCAT: - self._download_gifycat() - elif self.source_type == SourceType.YOUTUBE: - self._download_youtube() - elif self.source_type in (SourceType.IMAGURJPG, SourceType.IREDDIT): - self._download_raw_file() - elif self.source_type == SourceType.GREDDIT: - self._download_gallery_reddit() - except Exception as e: - 
self.downloaded = False - - def delete(self): - if self.paths: - for path in self.paths: - if os.path.exists(path): - os.unlink(path) - - def _download_youtube_dls(self, ydl_opts): - with youtube_dl.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(self.url, download=True) - if info.get('_type', None) == 'playlist': - for entry in info['entries']: - r = ydl.prepare_filename(entry) - self.paths.append(f'{os.path.splitext(r)[0]}.mp4') - else: - r = ydl.prepare_filename(info) - self.paths.append(f'{os.path.splitext(r)[0]}.mp4') - - self.downloaded = True - - def _download_redgifs(self): - ydl_opts = { - 'format': 'best', - 'merge_output_format': 'mp4', - 'outtmpl': OUTTMPL - } - self._download_youtube_dls(ydl_opts) - - def _download_gifycat(self): - ydl_opts = { - 'format': 'best', - 'merge_output_format': 'mp4', - 'outtmpl': OUTTMPL - } - self._download_youtube_dls(ydl_opts) - - def _download_vreddit(self): - ydl_opts = { - 'format': 'bestvideo+bestaudio/bestvideo', - 'merge_output_format': 'mp4', - 'outtmpl': OUTTMPL - } - self._download_youtube_dls(ydl_opts) - - def _download_youtube(self): - ydl_opts = { - 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio', - 'merge_output_format': 'mp4', - 'outtmpl': OUTTMPL - } - self._download_youtube_dls(ydl_opts) - - def _download_raw_file(self): - a = urlparse(self.url) - path = f'source_{os.path.basename(a.path)}' - - r = requests.get(self.url, stream=True) - if r.status_code == 200: - self.downloaded = True - with open(path, 'wb') as f: - r.raw.decode_content = True - shutil.copyfileobj(r.raw, f) - self.paths.append(path) - else: - self.downloaded = False - - def _download_gallery_reddit(self): - url = self.url - submission = self.reddit.submission(url=self.url) - for key in submission.media_metadata: - value = submission.media_metadata[key] - self.url = value['s']['u'] - self._download_raw_file() - - self.url = url - - @staticmethod - def _get_source_type(url): - if re.match("^.*v\\.redd\\.it.*$", 
url): - return SourceType.VREDDIT - if re.match("^.*i\\.redd\\.it.*\\.(jpg|jpeg)$", url): - return SourceType.IREDDIT - if re.match("^.*\\.youtube\\.com.*$", url): - return SourceType.YOUTUBE - if re.match("^.*redgifs\\.com.*$", url): - return SourceType.REDGIFS - if re.match("^.*i\\.imgur\\.com.*\\.(jpg|jpeg)$", url): - return SourceType.IMAGURJPG - if re.match("^.*gfycat.com.*$", url): - return SourceType.GFYCAT - if re.match("^.*www.reddit.com/gallery.*$", url): - return SourceType.GREDDIT - - return SourceType.UNKNOWN + with self.Provider(url=self.url, reddit=self.reddit) as provider: + provider.download() + self.paths = provider.paths + self.downloaded = provider.downloaded diff --git a/importer/providers/__init__.py b/importer/providers/__init__.py new file mode 100644 index 0000000..62c2d85 --- /dev/null +++ b/importer/providers/__init__.py @@ -0,0 +1,10 @@ +from .g_reddit import GReddit +from .providerbase import ProviderBase +from .gfycat import Gfycat +from .i_reddit import IReddit +from .imgur import Imgur +from .raw_image_base import RawImageProviderBase +from .redgifs import RedGifs +from .v_reddit import VReddit +from .youtube import Youtube +from .youtube_dl_base import YoutubeDlProviderBase diff --git a/importer/providers/g_reddit.py b/importer/providers/g_reddit.py new file mode 100644 index 0000000..53ee5df --- /dev/null +++ b/importer/providers/g_reddit.py @@ -0,0 +1,19 @@ +from praw import Reddit + +from importer.providers.raw_image_base import RawImageProviderBase + + +class GReddit(RawImageProviderBase): + regex = "^.*www.reddit.com/gallery.*$" + + def __init__(self, url: str, reddit: Reddit): + super(GReddit, self).__init__(url) + self.reddit = reddit + + def download(self): + submission = self.reddit.submission(url=self.url) + for key in submission.media_metadata: + value = submission.media_metadata[key] + url = value['s']['u'] + path = self._download_raw_file(url) + self.paths.append(path) diff --git a/importer/providers/gfycat.py 
b/importer/providers/gfycat.py new file mode 100644 index 0000000..70d9c05 --- /dev/null +++ b/importer/providers/gfycat.py @@ -0,0 +1,9 @@ +from importer.providers.youtube_dl_base import YoutubeDlProviderBase + + +class Gfycat(YoutubeDlProviderBase): + regex = "^.*gfycat.com.*$" + _TEST = [{ + "url": "https://gfycat.com/presentdangerousdromedary", + "paths": "source_presentdangerousdromedary.mp4" + }] diff --git a/importer/providers/i_reddit.py b/importer/providers/i_reddit.py new file mode 100644 index 0000000..797ce43 --- /dev/null +++ b/importer/providers/i_reddit.py @@ -0,0 +1,9 @@ +from importer.providers.raw_image_base import RawImageProviderBase + + +class IReddit(RawImageProviderBase): + regex = "^.*i\\.redd\\.it.*\\.(jpg|jpeg)$" + _TEST = [{ + "url": "https://i.redd.it/pjj1ll1b2rr41.jpg", + "paths": ["source_pjj1ll1b2rr41.jpg"] + }] diff --git a/importer/providers/imgur.py b/importer/providers/imgur.py new file mode 100644 index 0000000..dd8fb6e --- /dev/null +++ b/importer/providers/imgur.py @@ -0,0 +1,9 @@ +from importer.providers.raw_image_base import RawImageProviderBase + + +class Imgur(RawImageProviderBase): + regex = "^.*i\\.imgur\\.com.*\\.(jpg|jpeg)$" + _TEST = [{ + "url": "https://i.imgur.com/fXLMjfp.jpg", + "paths": ["source_fXLMjfp.jpg"], + }] diff --git a/importer/providers/providerbase.py b/importer/providers/providerbase.py new file mode 100644 index 0000000..374b9af --- /dev/null +++ b/importer/providers/providerbase.py @@ -0,0 +1,30 @@ +import os +from typing import List + + +class ProviderBase: + paths: List[str] + downloaded: bool + regex: str + + _TEST = [{ + "url": "https://i.imgur.com/fXLMjfp.jpg", + "paths": ["source_fXLMjfp.jpg"], + }] + + def __init__(self, url: str): + self.url = url + self.paths = [] + self.downloaded = False + + def download(self): + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.paths: + for path in self.paths: + if os.path.exists(path): + 
os.unlink(path) diff --git a/importer/providers/raw_image_base.py b/importer/providers/raw_image_base.py new file mode 100644 index 0000000..267dcad --- /dev/null +++ b/importer/providers/raw_image_base.py @@ -0,0 +1,27 @@ +import os +import shutil +from urllib.parse import urlparse + +import requests + +from importer.providers.providerbase import ProviderBase + + +class RawImageProviderBase(ProviderBase): + regex = "^.*i.(jpg|jpeg|mp4)$" + + def download(self): + path = self._download_raw_file(self.url) + self.paths.append(path) + self.downloaded = True + + @staticmethod + def _download_raw_file(url: str) -> str: + a = urlparse(url) + path = f'source_{os.path.basename(a.path)}' + r = requests.get(url, stream=True) + if r.status_code == 200: + with open(path, 'wb') as f: + r.raw.decode_content = True + shutil.copyfileobj(r.raw, f) + return path diff --git a/importer/providers/redgifs.py b/importer/providers/redgifs.py new file mode 100644 index 0000000..e15468f --- /dev/null +++ b/importer/providers/redgifs.py @@ -0,0 +1,9 @@ +from importer.providers.youtube_dl_base import YoutubeDlProviderBase + + +class RedGifs(YoutubeDlProviderBase): + regex = "^.*redgifs\\.com.*$" + _TEST = [{ + "url": "https://redgifs.com/watch/ripesnivelingfiddlercrab", + "paths": ["source_RipeSnivelingFiddlercrab.mp4", 'source_RipeSnivelingFiddlercrab-mobile.mp4'] + }] diff --git a/importer/providers/v_reddit.py b/importer/providers/v_reddit.py new file mode 100644 index 0000000..2917fee --- /dev/null +++ b/importer/providers/v_reddit.py @@ -0,0 +1,10 @@ +from importer.providers.youtube_dl_base import YoutubeDlProviderBase + + +class VReddit(YoutubeDlProviderBase): + regex = "^.*v\\.redd\\.it.*$" + format = 'bestvideo+bestaudio/bestvideo' + _TEST = [{ + "url": "https://v.redd.it/42j6r7i8z7151", + "paths": ["source_42j6r7i8z7151.mp4"] + }] diff --git a/importer/providers/youtube.py b/importer/providers/youtube.py new file mode 100644 index 0000000..d880aa0 --- /dev/null +++ 
b/importer/providers/youtube.py @@ -0,0 +1,10 @@ +from importer.providers.youtube_dl_base import YoutubeDlProviderBase + + +class Youtube(YoutubeDlProviderBase): + regex = "^.*\\.youtube\\.com.*$" + format = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio" + _TEST = [{ + "url": "https://www.youtube.com/watch?v=Wjrrgrvq1ew", + "paths": ["source_Wjrrgrvq1ew.mp4"] + }] diff --git a/importer/providers/youtube_dl_base.py b/importer/providers/youtube_dl_base.py new file mode 100644 index 0000000..3bb2fb8 --- /dev/null +++ b/importer/providers/youtube_dl_base.py @@ -0,0 +1,36 @@ +import os + +import youtube_dl + +from importer.providers.providerbase import ProviderBase + + +class YoutubeDlProviderBase(ProviderBase): + regex = ".*" + output_template: str = 'source_%(id)s.%(ext)s' + format: str = "best" + merge_format_output: str = "mp4" + + _TEST = [{ + "url": "https://www.youtube.com/watch?v=Wjrrgrvq1ew", + "paths": ["source_Wjrrgrvq1ew.mp4"] + }] + + def download(self): + ydl_opts = { + 'format': self.format, + 'merge_output_format': self.merge_format_output, + 'outtmpl': self.output_template + } + + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(self.url, download=True) + if info.get('_type', None) == 'playlist': + for entry in info['entries']: + r = ydl.prepare_filename(entry) + self.paths.append(f'{os.path.splitext(r)[0]}.mp4') + else: + r = ydl.prepare_filename(info) + self.paths.append(f'{os.path.splitext(r)[0]}.mp4') + + self.downloaded = True diff --git a/test/test_download.py b/test/test_download.py deleted file mode 100644 index f13ed67..0000000 --- a/test/test_download.py +++ /dev/null @@ -1,53 +0,0 @@ -import os - -import pytest - -from importer.downloader import SourceType, Downloader - -@pytest.fixture -def mock_ydl_download(mocker): - # this function is responsible for downloading the file - return mocker.patch('importer.downloader.youtube_dl.YoutubeDL.process_info') - - -@pytest.mark.parametrize('url,source_type', [ - 
("https://i.redd.it/pjj1ll1b2rr41.jpg", SourceType.IREDDIT), - ("https://gfycat.com/presentdangerousdromedary", SourceType.GFYCAT), - ("https://i.imgur.com/fXLMjfp.jpg", SourceType.IMAGURJPG), - ("https://redgifs.com/watch/ripesnivelingfiddlercrab", SourceType.REDGIFS), - ("https://www.youtube.com/watch?v=oLkdqptmfng", SourceType.YOUTUBE), - ("https://v.redd.it/42j6r7i8z7151", SourceType.VREDDIT), - ("https://www.reddit.com/gallery/mik7c9", SourceType.GREDDIT), - ("https://duckduckgo.com", SourceType.UNKNOWN), -]) -def test_source_type(url, source_type): - with Downloader(url, "1-A") as d: - assert d.source_type == source_type - - -@pytest.mark.parametrize('url,paths', [ - ("https://gfycat.com/presentdangerousdromedary", ["source_presentdangerousdromedary.mp4"]), - ("https://redgifs.com/watch/ripesnivelingfiddlercrab", ["source_RipeSnivelingFiddlercrab.mp4", 'source_RipeSnivelingFiddlercrab-mobile.mp4']), - ("https://www.youtube.com/watch?v=oLkdqptmfng", ["source_oLkdqptmfng.mp4"]), - ("https://v.redd.it/42j6r7i8z7151", ["source_42j6r7i8z7151.mp4"]), -]) -def test_download_youtube_dl(url, paths, mock_ydl_download): - with Downloader(url, "1-A") as d: - assert d.downloaded is False - d.download() - assert d.downloaded is True - assert d.paths == paths - mock_ydl_download.assert_called() - - -@pytest.mark.parametrize('url,path', [ - ("https://i.redd.it/pjj1ll1b2rr41.jpg", "source_pjj1ll1b2rr41.jpg"), - ("https://i.imgur.com/fXLMjfp.jpg", "source_fXLMjfp.jpg"), -]) -def test_download_raw_data(url, path): - with Downloader(url, "1-A") as d: - assert d.downloaded is False - d.download() - assert d.paths == [path] - assert d.downloaded is True - diff --git a/test/test_providers.py b/test/test_providers.py new file mode 100644 index 0000000..9a5084e --- /dev/null +++ b/test/test_providers.py @@ -0,0 +1,23 @@ +import praw +import pytest + +from importer.downloader import Downloader +import importer.providers as providers +from importer.providers import ProviderBase + + 
+@pytest.mark.parametrize("provider", + [ + providers.IReddit, + providers.Imgur, + providers.RawImageProviderBase, + providers.RedGifs, + providers.Youtube, + providers.YoutubeDlProviderBase + ]) +def test_provider(provider): + for test in provider._TEST: + with provider(url=test['url']) as p: + p.download() + assert p.downloaded + assert p.paths == test['paths'] |