-rw-r--r--   .gitignore            |   5
-rw-r--r--   .gitlab-ci.yml        |  45
-rw-r--r--   Dockerfile            |  15
-rw-r--r--   dev-requirements.txt  |   5
-rw-r--r--   download.py           | 157
-rw-r--r--   requirements.txt      |   8
-rw-r--r--   test_download.py      |  79
-rw-r--r--   user.py               |  48
-rw-r--r--   util.py               |  40

9 files changed, 402 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f2b59a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.pytest_cache/
+.venv/
+.coverage
+.idea/
+__pycache__/
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..795dbfb
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,45 @@
+stages:
+  - test
+  - build
+
+docker-build-master:
+  image: docker:latest
+  stage: build
+  services:
+    - docker:dind
+  before_script:
+    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
+  script:
+    - docker build --pull -t "$CI_REGISTRY_IMAGE" -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA" .
+    - docker push "$CI_REGISTRY_IMAGE"
+    - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA"
+  only:
+    - master
+
+docker-build:
+  image: docker:latest
+  stage: build
+  services:
+    - docker:dind
+  before_script:
+    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
+  script:
+    - docker build --pull -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG" -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA" .
+    - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG"
+    - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA"
+  except:
+    - master
+
+
+test:
+  image: python:3-alpine
+  stage: test
+  script:
+    - pip install -r dev-requirements.txt
+    - pytest -n 4 --junitxml=report.xml
+    - coverage run -m pytest
+    - coverage xml -o cobertura.xml
+  artifacts:
+    reports:
+      cobertura: cobertura.xml
+      junit: report.xml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..7d5fbaf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3-alpine
+
+RUN apk add --no-cache ffmpeg
+
+WORKDIR /opt
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY download.py .
+COPY user.py .
+COPY util.py .
+
+
+ENTRYPOINT python user.py
\ No newline at end of file
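The entrypoint runs user.py with no command-line arguments, so inside the container the script is expected to pick up its configuration from environment variables via the argparse defaults defined in util.py further down. A minimal sanity-check sketch of that assumption (the variable names come from util.py; supplying them at `docker run` time is an assumption, not part of the commit):

import os

# Names taken from util.py's argparse defaults. NEXTCLOUD_HOST is omitted
# because util.py falls back to 'localhost' when it is unset.
EXPECTED = ('CLIENT_ID', 'CLIENT_SECRET', 'REDDIT_USERNAME', 'REDDIT_PASSWORD',
            'NEXTCLOUD_USERNAME', 'NEXTCLOUD_PASSWORD')

missing = [name for name in EXPECTED if not os.environ.get(name)]
if missing:
    raise SystemExit(f"missing configuration: {', '.join(missing)}")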
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 100644
index 0000000..dec7c39
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1,5 @@
+-r ./requirements.txt
+pytest
+pytest-mock
+pytest-xdist
+coverage
\ No newline at end of file
diff --git a/download.py b/download.py
new file mode 100644
index 0000000..c67834b
--- /dev/null
+++ b/download.py
@@ -0,0 +1,157 @@
+import os
+import re
+import shutil
+from enum import Enum
+from urllib.parse import urlparse
+
+import youtube_dl
+import requests
+
+from praw import Reddit
+
+
+class SourceType(Enum):
+    VREDDIT = 1
+    IREDDIT = 2
+    YOUTUBE = 4
+    REDGIFS = 5
+    IMAGURJPG = 6
+    GFYCAT = 7
+    GREDDIT = 8
+    UNKNOWN = 1000
+
+
+OUTTMPL = 'source_%(id)s.%(ext)s'
+
+
+class Downloader:
+    reddit: Reddit
+    username: str
+    downloaded: bool
+    post_id: str
+    source_type: SourceType
+    paths: list[str]
+
+    def __init__(self, url: str, reddit: Reddit):
+        self.reddit = reddit
+        self.downloaded = False
+        self.url = url
+        self.source_type = self._get_source_type(url)
+        self.paths = []
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.delete()
+
+    def download(self):
+        try:
+            if self.source_type == SourceType.VREDDIT:
+                self._download_vreddit()
+            elif self.source_type == SourceType.REDGIFS:
+                self._download_redgifs()
+            elif self.source_type == SourceType.GFYCAT:
+                self._download_gifycat()
+            elif self.source_type == SourceType.YOUTUBE:
+                self._download_youtube()
+            elif self.source_type in (SourceType.IMAGURJPG, SourceType.IREDDIT):
+                self._download_raw_file()
+            elif self.source_type == SourceType.GREDDIT:
+                self._download_gallery_reddit()
+        except Exception as e:
+            self.downloaded = False
+
+    def delete(self):
+        if self.paths:
+            for path in self.paths:
+                if os.path.exists(path):
+                    os.unlink(path)
+
+    def _download_youtube_dls(self, ydl_opts):
+        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(self.url, download=True)
+            if info.get('_type', None) == 'playlist':
+                for entry in info['entries']:
+                    r = ydl.prepare_filename(entry)
+                    self.paths.append(f'{os.path.splitext(r)[0]}.mp4')
+            else:
+                r = ydl.prepare_filename(info)
+                self.paths.append(f'{os.path.splitext(r)[0]}.mp4')
+
+            self.downloaded = True
+
+    def _download_redgifs(self):
+        ydl_opts = {
+            'format': 'best',
+            'merge_output_format': 'mp4',
+            'outtmpl': OUTTMPL
+        }
+        self._download_youtube_dls(ydl_opts)
+
+    def _download_gifycat(self):
+        ydl_opts = {
+            'format': 'best',
+            'merge_output_format': 'mp4',
+            'outtmpl': OUTTMPL
+        }
+        self._download_youtube_dls(ydl_opts)
+
+    def _download_vreddit(self):
+        ydl_opts = {
+            'format': 'bestvideo+bestaudio/bestvideo',
+            'merge_output_format': 'mp4',
+            'outtmpl': OUTTMPL
+        }
+        self._download_youtube_dls(ydl_opts)
+
+    def _download_youtube(self):
+        ydl_opts = {
+            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio',
+            'merge_output_format': 'mp4',
+            'outtmpl': OUTTMPL
+        }
+        self._download_youtube_dls(ydl_opts)
+
+    def _download_raw_file(self):
+        a = urlparse(self.url)
+        path = f'source_{os.path.basename(a.path)}'
+
+        r = requests.get(self.url, stream=True)
+        if r.status_code == 200:
+            self.downloaded = True
+            with open(path, 'wb') as f:
+                r.raw.decode_content = True
+                shutil.copyfileobj(r.raw, f)
+            self.paths.append(path)
+        else:
+            self.downloaded = False
+
+    def _download_gallery_reddit(self):
+        url = self.url
+        submission = self.reddit.submission(url=self.url)
+        for key in submission.media_metadata:
+            value = submission.media_metadata[key]
+            self.url = value['s']['u']
+            self._download_raw_file()
+
+        self.url = url
+
+    @staticmethod
+    def _get_source_type(url):
+        if re.match("^.*v\\.redd\\.it.*$", url):
+            return SourceType.VREDDIT
+        if re.match("^.*i\\.redd\\.it.*\\.(jpg|jpeg)$", url):
+            return SourceType.IREDDIT
+        if re.match("^.*\\.youtube\\.com.*$", url):
+            return SourceType.YOUTUBE
+        if re.match("^.*redgifs\\.com.*$", url):
+            return SourceType.REDGIFS
+        if re.match("^.*i\\.imgur\\.com.*\\.(jpg|jpeg)$", url):
+            return SourceType.IMAGURJPG
+        if re.match("^.*gfycat.com.*$", url):
+            return SourceType.GFYCAT
+        if re.match("^.*www.reddit.com/gallery.*$", url):
+            return SourceType.GREDDIT
+
+        return SourceType.UNKNOWN
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..cb804f0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+kafka-python
+youtube-dl
+requests
+praw
+pika
+jsonpickle
+requests
+nextcloud-api-wrapper
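For orientation, a minimal usage sketch of the Downloader context manager, mirroring how user.py below drives it; the credential placeholders are assumptions and not part of the commit:

import praw

from download import Downloader

# Placeholder credentials -- replace with real values. The "hcrawler"
# user agent is the one used elsewhere in this commit.
reddit = praw.Reddit(client_id="...", client_secret="...",
                     username="...", password="...", user_agent="hcrawler")

# __exit__ calls delete(), so any files written by download() are removed
# once the block ends.
with Downloader("https://v.redd.it/42j6r7i8z7151", reddit) as d:
    d.download()
    if d.downloaded:
        for path in d.paths:
            print("fetched", path)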
diff --git a/test_download.py b/test_download.py
new file mode 100644
index 0000000..6f86bbc
--- /dev/null
+++ b/test_download.py
@@ -0,0 +1,79 @@
+import os
+
+import pytest
+
+from download import SourceType, Downloader
+
+reddit_env = pytest.mark.skipif(
+    os.environ.get('CLIENT_ID', '') == '' or
+    os.environ.get('CLIENT_SECRET', '') == '' or
+    os.environ.get('USERNAME', '') == '' or
+    os.environ.get('PASSWORD', '') == ''
+    , reason="Require reddit env variables to be set."
+)
+
+
+@pytest.fixture
+def mock_ydl_download(mocker):
+    # this function is responsible for downloading the file
+    return mocker.patch('download.youtube_dl.YoutubeDL.process_info')
+
+
+@pytest.mark.parametrize('url,source_type', [
+    ("https://i.redd.it/pjj1ll1b2rr41.jpg", SourceType.IREDDIT),
+    ("https://gfycat.com/presentdangerousdromedary", SourceType.GFYCAT),
+    ("https://i.imgur.com/fXLMjfp.jpg", SourceType.IMAGURJPG),
+    ("https://redgifs.com/watch/ripesnivelingfiddlercrab", SourceType.REDGIFS),
+    ("https://www.youtube.com/watch?v=oLkdqptmfng", SourceType.YOUTUBE),
+    ("https://v.redd.it/42j6r7i8z7151", SourceType.VREDDIT),
+    ("https://www.reddit.com/gallery/mik7c9", SourceType.GREDDIT),
+    ("https://duckduckgo.com", SourceType.UNKNOWN),
+])
+def test_source_type(url, source_type):
+    with Downloader(url, "1-A") as d:
+        assert d.source_type == source_type
+
+
+@pytest.mark.parametrize('url,paths', [
+    ("https://gfycat.com/presentdangerousdromedary", ["source_presentdangerousdromedary.mp4"]),
+    ("https://redgifs.com/watch/ripesnivelingfiddlercrab", ["source_RipeSnivelingFiddlercrab.mp4", 'source_RipeSnivelingFiddlercrab-mobile.mp4']),
+    ("https://www.youtube.com/watch?v=oLkdqptmfng", ["source_oLkdqptmfng.mp4"]),
+    ("https://v.redd.it/42j6r7i8z7151", ["source_42j6r7i8z7151.mp4"]),
+])
+def test_download_youtube_dl(url, paths, mock_ydl_download):
+    with Downloader(url, "1-A") as d:
+        assert d.downloaded is False
+        d.download()
+        assert d.downloaded is True
+        assert d.paths == paths
+        mock_ydl_download.assert_called()
+
+
+@pytest.mark.parametrize('url,path', [
+    ("https://i.redd.it/pjj1ll1b2rr41.jpg", "source_pjj1ll1b2rr41.jpg"),
+    ("https://i.imgur.com/fXLMjfp.jpg", "source_fXLMjfp.jpg"),
+])
+def test_download_raw_data(url, path):
+    with Downloader(url, "1-A") as d:
+        assert d.downloaded is False
+        d.download()
+        assert d.paths == [path]
+        assert d.downloaded is True
+
+
+@reddit_env
+def test_praw_download():
+    client_id = os.environ.get('CLIENT_ID', '')
+    client_secret = os.environ.get('CLIENT_SECRET', '')
+    username = os.environ.get('USERNAME', '')
+    password = os.environ.get('PASSWORD', '')
+    files = {'source_hlokpsyhgrq61.jpg', 'source_n31c2y7igrq61.jpg', 'source_7eg0o76igrq61.jpg',
+             'source_whl12jbigrq61.jpg', 'source_4uok762igrq61.jpg', 'source_t3pgm64igrq61.jpg',
+             'source_ymc4hv9igrq61.jpg'}
+
+    with Downloader("https://www.reddit.com/gallery/mik7c9", "1-A", client_id=client_id, client_secret=client_secret,
+                    password=password, user_agent="hcrawler", username=username) as d:
+        assert d.downloaded is False
+        d.download()
+        assert d.downloaded is True
+        assert set(d.paths) == files
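The mock_ydl_download fixture stubs out the piece of youtube-dl that actually transfers the file, so the youtube-dl-based tests exercise URL classification and filename handling rather than the download itself. A standalone sketch of the same idea using unittest.mock directly (the example URL is reused from the test data; metadata extraction still goes to the network):

from unittest import mock

import youtube_dl

# process_info() performs the actual download; patching it leaves metadata
# extraction and prepare_filename() untouched, which is all the tests assert on.
with mock.patch.object(youtube_dl.YoutubeDL, 'process_info') as fake_download:
    with youtube_dl.YoutubeDL({'outtmpl': 'source_%(id)s.%(ext)s'}) as ydl:
        info = ydl.extract_info('https://www.youtube.com/watch?v=oLkdqptmfng',
                                download=True)
        print(ydl.prepare_filename(info), fake_download.called)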
diff --git a/user.py b/user.py
new file mode 100644
--- /dev/null
+++ b/user.py
@@ -0,0 +1,48 @@
+import praw
+from time import sleep
+
+from nextcloud import NextCloud
+from praw.models.util import stream_generator
+
+from download import Downloader
+from util import jsonfy, try_post, parser
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    reddit = praw.Reddit(client_id=args.client_id,
+                         client_secret=args.client_secret,
+                         password=args.reddit_password,
+                         user_agent="hcrawler",
+                         username=args.reddit_username)
+
+    nxc = NextCloud(
+        args.nextcloud_host,
+        user=args.nextcloud_username,
+        password=args.nextcloud_password,
+        session_kwargs={'verify': False}
+    )
+
+    nxc.create_folder(f"im", True)
+
+    redditor = reddit.redditor(args.reddit_username)
+
+
+    def uplaod(post):
+        url = post.url
+        nxc.create_folder(f"im/{post.subreddit}/", True)
+        with Downloader(url=url, reddit=reddit) as d:
+            d.download()
+            for path in d.paths:
+                if "-mobile" in path:  # Remove mobile version
+                    continue
+                nxc.upload_file(path, f"im/{post.subreddit}/{path}")
+
+
+    for post in redditor.saved(limit=None):
+        uplaod(post)
+
+    sleep(60)
+
+    generator = stream_generator(redditor.saved, attribute_name="name")
+    for post in generator:
+        uplaod(post)
diff --git a/util.py b/util.py
new file mode 100644
--- /dev/null
+++ b/util.py
@@ -0,0 +1,40 @@
+import argparse
+import json
+import os
+from time import sleep
+from typing import Dict
+
+import jsonpickle
+import requests
+
+headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
+
+parser = argparse.ArgumentParser(description="Monitor saved")
+parser.add_argument('-c', '--client-id', help="Reddit client id", default=os.environ.get('CLIENT_ID', ''))
+parser.add_argument('-s', '--client-secret', help="Reddit client secret", default=os.environ.get('CLIENT_SECRET', ''))
+parser.add_argument('-u', '--reddit-username', help="Reddit username", default=os.environ.get('REDDIT_USERNAME', ''))
+parser.add_argument('-p', '--reddit-password', help="Reddit user password", default=os.environ.get('REDDIT_PASSWORD', ''))
+parser.add_argument('-P', '--nextcloud-password', help="Nextcloud Password", default=os.environ.get('NEXTCLOUD_PASSWORD', ''))
+parser.add_argument('-U', '--nextcloud-username', help="Nextcloud Username", default=os.environ.get('NEXTCLOUD_USERNAME', ''))
+parser.add_argument('-o', '--nextcloud-host', help="Nextcloud Host", default=os.environ.get('NEXTCLOUD_HOST', 'localhost'))
+
+
+def try_post(url, json_string, count=0):
+    try:
+        if count > 10:
+            return
+        r = requests.post(url, data=json_string, headers=headers)
+        if r.status_code != 200:
+            sleep(60 * count)
+            try_post(url, json_string, count + 1)
+    except:
+        sleep(60 * count)
+        try_post(url, json_string, count + 1)
+
+
+def jsonfy(post):
+    json_string = jsonpickle.encode(post)
+    json_dict: Dict = json.loads(json_string)
+    json_dict.pop('_reddit')
+    json_dict.pop('py/object')
+    return json.dumps(json_dict)
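util.py's try_post and jsonfy are imported by user.py but not yet called there. A small sketch of how they compose, with a hypothetical endpoint URL that is not defined anywhere in this commit:

from util import jsonfy, try_post

# Hypothetical endpoint -- purely illustrative.
ENDPOINT = "http://localhost:8000/posts"

def publish(post):
    # jsonfy() strips the praw-internal fields before serialising the post;
    # try_post() retries with an increasing delay before eventually giving up.
    try_post(ENDPOINT, jsonfy(post))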
