From e04d89a1278f66c15be9cadb6ab524dbb878efd8 Mon Sep 17 00:00:00 2001
From: gabrielgio
Date: Sat, 3 Jul 2021 21:24:15 +0200
Subject: Initial commit

---
 .gitignore           |   5 ++
 .gitlab-ci.yml       |  45 +++++++++++++
 Dockerfile           |  15 +++++
 dev-requirements.txt |   5 ++
 download.py          | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt     |   7 ++
 test_download.py     |  80 +++++++++++++++++++++++
 user.py              |  56 ++++++++++++++++
 util.py              |  52 +++++++++++++++
 9 files changed, 442 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitlab-ci.yml
 create mode 100644 Dockerfile
 create mode 100644 dev-requirements.txt
 create mode 100644 download.py
 create mode 100644 requirements.txt
 create mode 100644 test_download.py
 create mode 100644 user.py
 create mode 100644 util.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f2b59a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.pytest_cache/
+.venv/
+.coverage
+.idea/
+__pycache__/
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..795dbfb
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,45 @@
+stages:
+  - test
+  - build
+
+docker-build-master:
+  image: docker:latest
+  stage: build
+  services:
+    - docker:dind
+  before_script:
+    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
+  script:
+    - docker build --pull -t "$CI_REGISTRY_IMAGE" -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA" .
+    - docker push "$CI_REGISTRY_IMAGE"
+    - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA"
+  only:
+    - master
+
+docker-build:
+  image: docker:latest
+  stage: build
+  services:
+    - docker:dind
+  before_script:
+    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
+  script:
+    - docker build --pull -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG" -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA" .
+    - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG"
+    - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA"
+  except:
+    - master
+
+
+test:
+  image: python:3-alpine
+  stage: test
+  script:
+    - pip install -r dev-requirements.txt
+    - pytest -n 4 --junitxml=report.xml
+    - coverage run -m pytest
+    - coverage xml -o cobertura.xml
+  artifacts:
+    reports:
+      cobertura: cobertura.xml
+      junit: report.xml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..7d5fbaf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3-alpine
+
+RUN apk add --no-cache ffmpeg
+
+WORKDIR /opt
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY download.py .
+COPY user.py .
+COPY util.py .
+
+
+ENTRYPOINT ["python", "user.py"]
\ No newline at end of file
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 100644
index 0000000..dec7c39
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1,5 @@
+-r ./requirements.txt
+pytest
+pytest-mock
+pytest-xdist
+coverage
\ No newline at end of file
diff --git a/download.py b/download.py
new file mode 100644
index 0000000..c67834b
--- /dev/null
+++ b/download.py
@@ -0,0 +1,177 @@
+import logging
+import os
+import re
+import shutil
+from enum import Enum
+from urllib.parse import urlparse
+
+import youtube_dl
+import requests
+
+from praw import Reddit
+
+logger = logging.getLogger(__name__)
+
+
+class SourceType(Enum):
+    """Media hosts the downloader can tell apart."""
+    VREDDIT = 1
+    IREDDIT = 2
+    YOUTUBE = 4
+    REDGIFS = 5
+    IMAGURJPG = 6
+    GFYCAT = 7
+    GREDDIT = 8
+    UNKNOWN = 1000
+
+
+# youtube-dl output template: files are named "source_" plus the media id.
+OUTTMPL = 'source_%(id)s.%(ext)s'
+
+# Seconds to wait on a raw-file request before giving up.
+REQUEST_TIMEOUT = 30
+
+# Tried in order; the first pattern found in the URL decides the type.
+SOURCE_PATTERNS = (
+    (re.compile(r'v\.redd\.it'), SourceType.VREDDIT),
+    (re.compile(r'i\.redd\.it.*\.(jpg|jpeg)$'), SourceType.IREDDIT),
+    (re.compile(r'\.youtube\.com'), SourceType.YOUTUBE),
+    (re.compile(r'redgifs\.com'), SourceType.REDGIFS),
+    (re.compile(r'i\.imgur\.com.*\.(jpg|jpeg)$'), SourceType.IMAGURJPG),
+    (re.compile(r'gfycat\.com'), SourceType.GFYCAT),
+    (re.compile(r'www\.reddit\.com/gallery'), SourceType.GREDDIT),
+)
+
+
+class Downloader:
+    """Download the media behind a URL and clean it up afterwards.
+
+    Use as a context manager: the files recorded in ``paths`` are deleted
+    when the ``with`` block exits.
+    """
+    reddit: Reddit
+    username: str
+    downloaded: bool
+    post_id: str
+    source_type: SourceType
+    paths: list[str]
+
+    def __init__(self, url: str, reddit: Reddit):
+        self.reddit = reddit
+        self.downloaded = False
+        self.url = url
+        self.source_type = self._get_source_type(url)
+        self.paths = []
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.delete()
+
+    def download(self):
+        """Fetch the media using the strategy for ``source_type``.
+
+        Never raises: failures are logged and leave ``downloaded`` False.
+        URLs of unknown type are ignored.
+        """
+        handlers = {
+            SourceType.VREDDIT: self._download_vreddit,
+            SourceType.REDGIFS: self._download_redgifs,
+            SourceType.GFYCAT: self._download_gifycat,
+            SourceType.YOUTUBE: self._download_youtube,
+            SourceType.IMAGURJPG: self._download_raw_file,
+            SourceType.IREDDIT: self._download_raw_file,
+            SourceType.GREDDIT: self._download_gallery_reddit,
+        }
+        handler = handlers.get(self.source_type)
+        if handler is None:
+            return
+        try:
+            handler()
+        except Exception:
+            # Best effort: a broken post must not stop the caller, but the
+            # reason has to end up in the logs instead of vanishing.
+            logger.exception("Failed to download %s", self.url)
+            self.downloaded = False
+
+    def delete(self):
+        """Remove every recorded file that still exists on disk."""
+        for path in self.paths:
+            if os.path.exists(path):
+                os.unlink(path)
+
+    def _download_youtube_dls(self, ydl_opts):
+        """Download ``self.url`` with youtube-dl and record the results."""
+        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(self.url, download=True)
+            # A playlist result carries one info dict per entry.
+            if info.get('_type') == 'playlist':
+                entries = info['entries']
+            else:
+                entries = [info]
+            for entry in entries:
+                filename = ydl.prepare_filename(entry)
+                # Paths are recorded as .mp4, the 'merge_output_format'.
+                self.paths.append(f'{os.path.splitext(filename)[0]}.mp4')
+
+        self.downloaded = True
+
+    def _download_with_format(self, fmt):
+        """Run youtube-dl with format selector ``fmt`` and mp4 output."""
+        self._download_youtube_dls({
+            'format': fmt,
+            'merge_output_format': 'mp4',
+            'outtmpl': OUTTMPL,
+        })
+
+    def _download_redgifs(self):
+        self._download_with_format('best')
+
+    def _download_gifycat(self):
+        self._download_with_format('best')
+
+    def _download_vreddit(self):
+        self._download_with_format('bestvideo+bestaudio/bestvideo')
+
+    def _download_youtube(self):
+        self._download_with_format(
+            'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio')
+
+    def _download_raw_file(self, url=None):
+        """Stream a plain file into "source_" plus the URL's basename.
+
+        :param url: address to fetch; defaults to ``self.url``.
+        """
+        url = url or self.url
+        path = f'source_{os.path.basename(urlparse(url).path)}'
+
+        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
+            if r.status_code != 200:
+                self.downloaded = False
+                return
+            # Recorded before writing so a partial file is still cleaned up.
+            self.paths.append(path)
+            with open(path, 'wb') as f:
+                r.raw.decode_content = True
+                shutil.copyfileobj(r.raw, f)
+        self.downloaded = True
+
+    def _download_gallery_reddit(self):
+        """Download every item of a Reddit gallery post."""
+        submission = self.reddit.submission(url=self.url)
+        results = []
+        for item in submission.media_metadata.values():
+            self._download_raw_file(item['s']['u'])
+            results.append(self.downloaded)
+        # Success means the gallery had items and all of them were fetched.
+        self.downloaded = bool(results) and all(results)
+
+    @staticmethod
+    def _get_source_type(url):
+        """Return the SourceType of the first pattern found in ``url``."""
+        for pattern, source_type in SOURCE_PATTERNS:
+            if pattern.search(url):
+                return source_type
+
+        return SourceType.UNKNOWN
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..cb804f0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+kafka-python
+youtube-dl
+requests
+praw
+pika
+jsonpickle
+nextcloud-api-wrapper
diff --git a/test_download.py b/test_download.py
new file mode 100644
index 0000000..6f86bbc
--- /dev/null
+++ b/test_download.py
@@ -0,0 +1,80 @@
+import os
+
+import pytest
+from praw import Reddit
+
+from download import SourceType, Downloader
+
+REDDIT_ENV_VARS = ('CLIENT_ID', 'CLIENT_SECRET', 'USERNAME', 'PASSWORD')
+
+reddit_env = pytest.mark.skipif(
+    not all(os.environ.get(name) for name in REDDIT_ENV_VARS),
+    reason="Require reddit env variables to be set."
+)
+
+
+@pytest.fixture
+def mock_ydl_download(mocker):
+    # this function is responsible for downloading the file
+    return mocker.patch('download.youtube_dl.YoutubeDL.process_info')
+
+
+@pytest.mark.parametrize('url,source_type', [
+    ("https://i.redd.it/pjj1ll1b2rr41.jpg", SourceType.IREDDIT),
+    ("https://gfycat.com/presentdangerousdromedary", SourceType.GFYCAT),
+    ("https://i.imgur.com/fXLMjfp.jpg", SourceType.IMAGURJPG),
+    ("https://redgifs.com/watch/ripesnivelingfiddlercrab", SourceType.REDGIFS),
+    ("https://www.youtube.com/watch?v=oLkdqptmfng", SourceType.YOUTUBE),
+    ("https://v.redd.it/42j6r7i8z7151", SourceType.VREDDIT),
+    ("https://www.reddit.com/gallery/mik7c9", SourceType.GREDDIT),
+    ("https://duckduckgo.com", SourceType.UNKNOWN),
+])
+def test_source_type(url, source_type):
+    with Downloader(url, "1-A") as d:
+        assert d.source_type == source_type
+
+
+@pytest.mark.parametrize('url,paths', [
+    ("https://gfycat.com/presentdangerousdromedary", ["source_presentdangerousdromedary.mp4"]),
+    ("https://redgifs.com/watch/ripesnivelingfiddlercrab", ["source_RipeSnivelingFiddlercrab.mp4", 'source_RipeSnivelingFiddlercrab-mobile.mp4']),
+    ("https://www.youtube.com/watch?v=oLkdqptmfng", ["source_oLkdqptmfng.mp4"]),
+    ("https://v.redd.it/42j6r7i8z7151", ["source_42j6r7i8z7151.mp4"]),
+])
+def test_download_youtube_dl(url, paths, mock_ydl_download):
+    with Downloader(url, "1-A") as d:
+        assert d.downloaded is False
+        d.download()
+        assert d.downloaded is True
+        assert d.paths == paths
+        mock_ydl_download.assert_called()
+
+
+@pytest.mark.parametrize('url,path', [
+    ("https://i.redd.it/pjj1ll1b2rr41.jpg", "source_pjj1ll1b2rr41.jpg"),
+    ("https://i.imgur.com/fXLMjfp.jpg", "source_fXLMjfp.jpg"),
+])
+def test_download_raw_data(url, path):
+    with Downloader(url, "1-A") as d:
+        assert d.downloaded is False
+        d.download()
+        assert d.paths == [path]
+        assert d.downloaded is True
+
+
+@reddit_env
+def test_praw_download():
+    # Downloader expects a ready praw client, not raw credentials.
+    reddit = Reddit(client_id=os.environ['CLIENT_ID'],
+                    client_secret=os.environ['CLIENT_SECRET'],
+                    password=os.environ['PASSWORD'],
+                    user_agent="hcrawler",
+                    username=os.environ['USERNAME'])
+    files = {'source_hlokpsyhgrq61.jpg', 'source_n31c2y7igrq61.jpg', 'source_7eg0o76igrq61.jpg',
+             'source_whl12jbigrq61.jpg', 'source_4uok762igrq61.jpg', 'source_t3pgm64igrq61.jpg',
+             'source_ymc4hv9igrq61.jpg'}
+
+    with Downloader("https://www.reddit.com/gallery/mik7c9", reddit) as d:
+        assert d.downloaded is False
+        d.download()
+        assert d.downloaded is True
+        assert set(d.paths) == files
diff --git a/user.py b/user.py
new file mode 100644
index 0000000..e386c2d
--- /dev/null
+++ b/user.py
@@ -0,0 +1,56 @@
+import praw
+from time import sleep
+
+from nextcloud import NextCloud
+from praw.models.util import stream_generator
+
+from download import Downloader
+from util import jsonfy, try_post, parser
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    reddit = praw.Reddit(client_id=args.client_id,
+                         client_secret=args.client_secret,
+                         password=args.reddit_password,
+                         user_agent="hcrawler",
+                         username=args.reddit_username)
+
+    # NOTE(review): 'verify': False appears to disable TLS certificate
+    # verification for the Nextcloud session; confirm this is intended.
+    nxc = NextCloud(
+        args.nextcloud_host,
+        user=args.nextcloud_username,
+        password=args.nextcloud_password,
+        session_kwargs={'verify': False}
+    )
+
+    nxc.create_folder("im", True)
+
+    redditor = reddit.redditor(args.reddit_username)
+
+
+    def upload(post):
+        """Download a saved post's media and upload it to Nextcloud."""
+        # Saved listings also contain comments, which have no media URL.
+        if not isinstance(post, praw.models.Submission):
+            return
+        folder = f"im/{post.subreddit}"
+        nxc.create_folder(f"{folder}/", True)
+        with Downloader(url=post.url, reddit=reddit) as d:
+            d.download()
+            for path in d.paths:
+                if "-mobile" in path:  # Remove mobile version
+                    continue
+                nxc.upload_file(path, f"{folder}/{path}")
+
+
+    # First pass: everything already saved.
+    for post in redditor.saved(limit=None):
+        upload(post)
+
+    sleep(60)
+
+    # Then keep following the listing for newly saved posts.
+    generator = stream_generator(redditor.saved, attribute_name="name")
+    for post in generator:
+        upload(post)
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..cd80506
--- /dev/null
+++ b/util.py
@@ -0,0 +1,52 @@
+import argparse
+import json
+import os
+from time import sleep
+from typing import Dict
+
+import jsonpickle
+import requests
+
+headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
+
+# try_post() makes attempts numbered 0..MAX_RETRIES.
+MAX_RETRIES = 10
+# Seconds to wait for the server on each POST.
+REQUEST_TIMEOUT = 30
+
+parser = argparse.ArgumentParser(description="Monitor saved")
+parser.add_argument('-c', '--client-id', help="Reddit client id", default=os.environ.get('CLIENT_ID', ''))
+parser.add_argument('-s', '--client-secret', help="Reddit client secret", default=os.environ.get('CLIENT_SECRET', ''))
+parser.add_argument('-u', '--reddit-username', help="Reddit username", default=os.environ.get('REDDIT_USERNAME', ''))
+parser.add_argument('-p', '--reddit-password', help="Reddit user password", default=os.environ.get('REDDIT_PASSWORD', ''))
+parser.add_argument('-P', '--nextcloud-password', help="Nextcloud Password", default=os.environ.get('NEXTCLOUD_PASSWORD', ''))
+parser.add_argument('-U', '--nextcloud-username', help="Nextcloud Username", default=os.environ.get('NEXTCLOUD_USERNAME', ''))
+parser.add_argument('-o', '--nextcloud-host', help="Nextcloud Host", default=os.environ.get('NEXTCLOUD_HOST', 'localhost'))
+
+
+def try_post(url, json_string, count=0):
+    """POST ``json_string`` to ``url``, retrying with a growing delay.
+
+    :param count: number of the first attempt; attempts run up to MAX_RETRIES.
+    :return: True once the server answers 200, False if every attempt failed.
+    """
+    for attempt in range(count, MAX_RETRIES + 1):
+        try:
+            r = requests.post(url, data=json_string, headers=headers,
+                              timeout=REQUEST_TIMEOUT)
+        except requests.RequestException:
+            r = None  # network failure: retry like a non-200 answer
+        if r is not None and r.status_code == 200:
+            return True
+        if attempt < MAX_RETRIES:
+            sleep(60 * attempt)
+    return False
+
+
+def jsonfy(post):
+    """Return ``post`` as JSON without the '_reddit' and 'py/object' keys."""
+    json_dict: Dict = json.loads(jsonpickle.encode(post))
+    # Tolerate objects that lack either key instead of raising KeyError.
+    for key in ('_reddit', 'py/object'):
+        json_dict.pop(key, None)
+    return json.dumps(json_dict)
-- 
cgit v1.2.3