Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package you-get for openSUSE:Factory checked in at 2022-12-12 17:39:26 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/you-get (Old) and /work/SRC/openSUSE:Factory/.you-get.new.1835 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "you-get" Mon Dec 12 17:39:26 2022 rev:45 rq:1042216 version:0.4.1650 Changes: -------- --- /work/SRC/openSUSE:Factory/you-get/you-get.changes 2022-07-02 15:34:46.903039312 +0200 +++ /work/SRC/openSUSE:Factory/.you-get.new.1835/you-get.changes 2022-12-12 17:41:31.677735071 +0100 @@ -1,0 +2,8 @@ +Sun Dec 11 21:35:31 UTC 2022 - Luigi Baldoni <aloi...@gmx.com> + +- Update to version 0.4.1650 + * Twitter: support tweets with multiple videos and NSFW tweets. + * TikTok: fix extraction. + * YouTube: improve extractor. + +------------------------------------------------------------------- @@ -4 +12,4 @@ -- Update to version 0.4.1612 (no changelog) +- Update to version 0.4.1620 + * Instagram: fix extraction. + * YouTube: fix extraction. + * set the oldest supported python version to 3.7.4 Old: ---- you-get-0.4.1620.tar.gz New: ---- you-get-0.4.1650.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ you-get.spec ++++++ --- /var/tmp/diff_new_pack.pEM9LX/_old 2022-12-12 17:41:32.217738107 +0100 +++ /var/tmp/diff_new_pack.pEM9LX/_new 2022-12-12 17:41:32.221738130 +0100 @@ -17,7 +17,7 @@ Name: you-get -Version: 0.4.1620 +Version: 0.4.1650 Release: 0 Summary: Dumb downloader that scrapes the web License: MIT ++++++ you-get-0.4.1620.tar.gz -> you-get-0.4.1650.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/.github/workflows/python-package.yml new/you-get-0.4.1650/.github/workflows/python-package.yml --- old/you-get-0.4.1620/.github/workflows/python-package.yml 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/.github/workflows/python-package.yml 2022-12-11 18:15:46.000000000 +0100 @@ -16,12 +16,12 @@ strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', 3.11-dev, pypy-3.8, pypy-3.9] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/common.py new/you-get-0.4.1650/src/you_get/common.py --- old/you-get-0.4.1620/src/you_get/common.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/common.py 2022-12-11 18:15:46.000000000 +0100 @@ -344,21 +344,24 @@ # an http.client implementation of get_content() # because urllib does not support "Connection: keep-alive" -def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0): +def getHttps(host, url, headers, debuglevel=0): import http.client conn = http.client.HTTPSConnection(host) conn.set_debuglevel(debuglevel) conn.request("GET", url, headers=headers) resp = conn.getresponse() + set_cookie = resp.getheader('set-cookie') data = resp.read() - if gzip: - data = ungzip(data) - if deflate: - data = undeflate(data) + try: + data = ungzip(data) # gzip + data = undeflate(data) # deflate + except: + pass - return str(data, encoding='utf-8') + conn.close() + return str(data, encoding='utf-8'), set_cookie # DEPRECATED in favor of get_content() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/bilibili.py new/you-get-0.4.1650/src/you_get/extractors/bilibili.py --- old/you-get-0.4.1620/src/you_get/extractors/bilibili.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/bilibili.py 2022-12-11 18:15:46.000000000 +0100 @@ -115,11 +115,15 @@ @staticmethod def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) + + @staticmethod + def bilibili_space_collection_api(mid, cid, pn=1, ps=30): + return 'https://api.bilibili.com/x/polymer/space/seasons_archives_list?mid=%s&season_id=%s&sort_reverse=false&page_num=%s&page_size=%s' % (mid, cid, pn, ps) @staticmethod def bilibili_series_archives_api(mid, sid, pn=1, ps=100): return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) - + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @@ -628,6 +632,8 @@ sort = 'space_channel' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url): sort = 'space_channel_series' + elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/collectiondetail\?.*sid=(\d+)', self.url): + sort = 'space_channel_collection' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url): sort = 'space_favlist' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/video', self.url): @@ -745,6 +751,20 @@ api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) archives_info = json.loads(api_content) # TBD: channel of more than 100 videos + + epn, i = len(archives_info['data']['archives']), 0 + for video in archives_info['data']['archives']: + i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) + url = 'https://www.bilibili.com/video/av%s' % video['aid'] + self.__class__().download_playlist_by_url(url, **kwargs) + + elif sort == 'space_channel_collection': + m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/collectiondetail\?.*sid=(\d+)', self.url) + mid, sid = m.group(1), m.group(2) + api_url = self.bilibili_space_collection_api(mid, sid) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + # TBD: channel of more than 100 videos epn, i = len(archives_info['data']['archives']), 0 for video in archives_info['data']['archives']: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/douyin.py new/you-get-0.4.1650/src/you_get/extractors/douyin.py --- old/you-get-0.4.1620/src/you_get/extractors/douyin.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/douyin.py 2022-12-11 18:15:46.000000000 +0100 @@ -1,8 +1,6 @@ # coding=utf-8 -import re import json -from urllib.parse import unquote from ..common import ( url_size, @@ -11,25 +9,52 @@ fake_headers, download_urls, playlist_not_supported, + match1, + get_location, ) - __all__ = ['douyin_download_by_url'] +def get_value(source: dict, path): + try: + value = source + for key in path: + if type(key) is str: + if key in value.keys(): + value = value[key] + else: + value = None + break + elif type(key) is int: + if len(value) != 0: + value = value[key] + else: + value = None + break + except: + value = None + return value + + def douyin_download_by_url(url, **kwargs): + # if short link, get the real url + if 'v.douyin.com' in url: + url = get_location(url) + aweme_id = match1(url, r'/(\d+)/?') + # get video info + video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}' + url = video_info_api.format(aweme_id) page_content = get_content(url, headers=fake_headers) - # The video player and video source are rendered client-side, the data - # contains in a <script id="RENDER_DATA" type="application/json"> tag - # quoted, unquote the whole page content then search using regex with - # regular string. - page_content = unquote(page_content) - title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip() + video_info = json.loads(page_content) + + # get video id and title + video_id = get_value(video_info, ['item_list', 0, 'video', 'vid']) + title = get_value(video_info, ['item_list', 0, 'desc']) + + # get video play url + video_url = "https://aweme.snssdk.com/aweme/v1/play/?ratio=720p&line=0&video_id={}".format(video_id) video_format = 'mp4' - # video URLs are in this pattern {"src":"THE_URL"}, in json format - urls_pattern = r'"playAddr":(\[.*?\])' - urls = json.loads(re.findall(urls_pattern, page_content)[0]) - video_url = 'https:' + urls[0]['src'] size = url_size(video_url, faker=True) print_info( site_info='douyin.com', title=title, diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/instagram.py new/you-get-0.4.1650/src/you_get/extractors/instagram.py --- old/you-get-0.4.1620/src/you_get/extractors/instagram.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/instagram.py 2022-12-11 18:15:46.000000000 +0100 @@ -19,9 +19,9 @@ api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id try: api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + post = json.loads(api_cont) except: log.wtf('[Error] Please specify a cookie file.') - post = json.loads(api_cont) for item in post['items']: code = item['code'] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/iqiyi.py new/you-get-0.4.1650/src/you_get/extractors/iqiyi.py --- old/you-get-0.4.1620/src/you_get/extractors/iqiyi.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/iqiyi.py 2022-12-11 18:15:46.000000000 +0100 @@ -131,10 +131,10 @@ html = get_html(self.url) tvid = r1(r'#curid=(.+)_', self.url) or \ r1(r'tvid=([^&]+)', self.url) or \ - r1(r'data-player-tvid="([^"]+)"', html) or r1(r'tv(?:i|I)d=(.+?)\&', html) or r1(r'param\[\'tvid\'\]\s*=\s*"(.+?)"', html) + r1(r'data-player-tvid="([^"]+)"', html) or r1(r'tv(?:i|I)d=(\w+?)\&', html) or r1(r'param\[\'tvid\'\]\s*=\s*"(.+?)"', html) videoid = r1(r'#curid=.+_(.*)$', self.url) or \ r1(r'vid=([^&]+)', self.url) or \ - r1(r'data-player-videoid="([^"]+)"', html) or r1(r'vid=(.+?)\&', html) or r1(r'param\[\'vid\'\]\s*=\s*"(.+?)"', html) + r1(r'data-player-videoid="([^"]+)"', html) or r1(r'vid=(\w+?)\&', html) or r1(r'param\[\'vid\'\]\s*=\s*"(.+?)"', html) self.vid = (tvid, videoid) info_u = 'http://pcw-api.iqiyi.com/video/video/playervideoinfo?tvid=' + tvid json_res = get_content(info_u) @@ -203,8 +203,13 @@ # For legacy main() #Here's the change!! - download_url_ffmpeg(urls[0], self.title, 'mp4', output_dir=kwargs['output_dir'], merge=kwargs['merge'], stream=False) - + # ffmpeg fails to parse. + # download_url_ffmpeg(urls[0], self.title, 'mp4', output_dir=kwargs['output_dir'], merge=kwargs['merge'], stream=False) + #Here's the way works out + urls = general_m3u8_extractor(urls[0]) + # ffmpeg fail to convert the output video with mkv extension, due to sort of timestamp problem + download_urls(urls, self.title, 'mp4', 0, **kwargs) + if not kwargs['caption']: print('Skipping captions.') return diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/ixigua.py new/you-get-0.4.1650/src/you_get/extractors/ixigua.py --- old/you-get-0.4.1620/src/you_get/extractors/ixigua.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/ixigua.py 2022-12-11 18:15:46.000000000 +0100 @@ -95,6 +95,8 @@ def convertStreams(video_list, audio_url): streams = [] + if type(video_list) == dict: + video_list = video_list.values() for dynamic_video in video_list: streams.append({ 'file_id': dynamic_video['file_hash'], diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/miaopai.py new/you-get-0.4.1650/src/you_get/extractors/miaopai.py --- old/you-get-0.4.1620/src/you_get/extractors/miaopai.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/miaopai.py 2022-12-11 18:15:46.000000000 +0100 @@ -80,6 +80,8 @@ def miaopai_download_h5api(url, output_dir='.', merge=False, info_only=False, **kwargs): oid = match1(url, r'/show/(\d{4}:\w+)') + if oid is None: + oid = match1(url, r'\?fid=(\d{4}:\w+)') page = "/show/%s" % oid data_url = 'https://h5.video.weibo.com/api/component?%s' % parse.urlencode({ 'page': page @@ -156,6 +158,9 @@ if re.match(r'^http[s]://(.+\.)?weibo\.com/(tv/)?show/(\d{4}:\w+)', url): return miaopai_download_h5api(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs) + if re.match(r'^http[s]://(.+\.)?weibo\.com/show\?fid=(\d{4}:\w+)', url): + return miaopai_download_h5api(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs) + fid = match1(url, r'\?fid=(\d{4}:\w+)') if fid is not None: miaopai_download_by_fid(fid, output_dir, merge, info_only) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/missevan.py new/you-get-0.4.1650/src/you_get/extractors/missevan.py --- old/you-get-0.4.1620/src/you_get/extractors/missevan.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/missevan.py 2022-12-11 18:15:46.000000000 +0100 @@ -25,6 +25,7 @@ import json import os import re +import urllib.parse from ..common import get_content, urls_size, log, player, dry_run from ..extractor import VideoExtractor @@ -99,7 +100,8 @@ return stream.lower() in ('covers', 'coversmini') def get_file_extension(file_path, default=''): - _, suffix = os.path.splitext(file_path) + url_parse_result = urllib.parse.urlparse(file_path) + _, suffix = os.path.splitext(url_parse_result.path) if suffix: # remove dot suffix = suffix[1:] @@ -310,7 +312,7 @@ or kwargs.get('json_output'): for _, stream in self.streams.items(): - stream['size'] = urls_size(stream['src']) + stream['size'] = urls_size(stream['src'], faker=True) return # fetch size of the selected stream only @@ -319,7 +321,7 @@ stream = self.streams[stream_id] if 'size' not in stream: - stream['size'] = urls_size(stream['src']) + stream['size'] = urls_size(stream['src'], faker=True) def _get_content(self, url): return get_content(url, headers=self.__headers) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/netease.py new/you-get-0.4.1650/src/you_get/extractors/netease.py --- old/you-get-0.4.1620/src/you_get/extractors/netease.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/netease.py 2022-12-11 18:15:46.000000000 +0100 @@ -79,9 +79,14 @@ netease_song_download(j["program"]["mainSong"], output_dir=output_dir, info_only=info_only) elif "radio" in url: - j = loads(get_content("http://music.163.com/api/dj/program/byradio/?radioId=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) - for i in j['programs']: - netease_song_download(i["mainSong"],output_dir=output_dir, info_only=info_only) + offset = 0 + while True: + j = loads(get_content("http://music.163.com/api/dj/program/byradio/?radioId=%s&ids=[%s]&csrf_token=&offset=%d" % (rid, rid, offset), headers={"Referer": "http://music.163.com/"})) + for i in j['programs']: + netease_song_download(i["mainSong"], output_dir=output_dir, info_only=info_only) + if not j['more']: + break + offset += len(j['programs']) elif "mv" in url: j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/tiktok.py new/you-get-0.4.1650/src/you_get/extractors/tiktok.py --- old/you-get-0.4.1620/src/you_get/extractors/tiktok.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/tiktok.py 2022-12-11 18:15:46.000000000 +0100 @@ -9,22 +9,23 @@ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', + 'Referer': 'https://www.tiktok.com/', 'Connection': 'keep-alive' # important } m = re.match('(https?://)?([^/]+)(/.*)', url) host = m.group(2) if host != 'www.tiktok.com': # non-canonical URL - html = getHttps(host, url, headers=headers, gzip=False) - url = r1(r'(https://www.tiktok.com/[^?"]+)', html) - # use canonical URL - m = re.match('(https?://)?([^/]+)(/.*)', url) - host = m.group(2) + vid = r1(r'/video/(\d+)', url) + url = 'https://www.tiktok.com/@/video/%s/' % vid + host = 'www.tiktok.com' + else: + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers - url = m.group(3).split('?')[0] - vid = url.split('/')[3] # should be a string of numbers - - html = getHttps(host, url, headers=headers) + html, set_cookie = getHttps(host, url, headers=headers) + tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie) + headers['Cookie'] = 'tt_chain_token=%s' % tt_chain_token data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ r1(r'<script id="SIGI_STATE" type="application/json">(.*?)</script>', html) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/twitter.py new/you-get-0.4.1650/src/you_get/extractors/twitter.py --- old/you-get-0.4.1620/src/you_get/extractors/twitter.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/twitter.py 2022-12-11 18:15:46.000000000 +0100 @@ -41,57 +41,83 @@ r1(r'<meta name="twitter:site:id" content="([^"]*)"', html) page_title = "{} [{}]".format(screen_name, item_id) - authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' + try: + authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' - ga_url = 'https://api.twitter.com/1.1/guest/activate.json' - ga_content = post_content(ga_url, headers={'authorization': authorization}) - guest_token = json.loads(ga_content)['guest_token'] - - api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id - api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) - - info = json.loads(api_content) - if item_id not in info['globalObjects']['tweets']: - # something wrong here - log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) - return - - elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: - # if the tweet contains media, download them - media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] - - elif info['globalObjects']['tweets'][item_id].get('is_quote_status') == True: - # if the tweet does not contain media, but it quotes a tweet - # and the quoted tweet contains media, download them - item_id = info['globalObjects']['tweets'][item_id]['quoted_status_id_str'] + # FIXME: 403 with cookies + ga_url = 'https://api.twitter.com/1.1/guest/activate.json' + ga_content = post_content(ga_url, headers={'authorization': authorization}) + guest_token = json.loads(ga_content)['guest_token'] api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) + if item_id not in info['globalObjects']['tweets']: + # something wrong here + #log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) + assert False - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: + # if the tweet contains media, download them media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] + + elif 'entities' in info['globalObjects']['tweets'][item_id]: + # if the tweet contains media from another tweet, download it + expanded_url = None + for j in info['globalObjects']['tweets'][item_id]['entities']['urls']: + if re.match(r'^https://twitter.com/.*', j['expanded_url']): + # FIXME: multiple valid expanded_url's? + expanded_url = j['expanded_url'] + if expanded_url is not None: + item_id = r1(r'/status/(\d+)', expanded_url) + assert False + + elif info['globalObjects']['tweets'][item_id].get('is_quote_status') == True: + # if the tweet does not contain media, but it quotes a tweet + # and the quoted tweet contains media, download them + item_id = info['globalObjects']['tweets'][item_id]['quoted_status_id_str'] + + api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id + api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) + + info = json.loads(api_content) + + if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] + else: + # quoted tweet has no media + return + else: - # quoted tweet has no media + # no media, no quoted tweet return - else: - # no media, no quoted tweet - return + except: + authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw' + + # FIXME: 403 with cookies + ga_url = 'https://api.twitter.com/1.1/guest/activate.json' + ga_content = post_content(ga_url, headers={'authorization': authorization}) + guest_token = json.loads(ga_content)['guest_token'] + + api_url = 'https://api.twitter.com/1.1/statuses/show/%s.json?tweet_mode=extended' % item_id + api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) + info = json.loads(api_content) + media = info['extended_entities']['media'] for medium in media: if 'video_info' in medium: - # FIXME: we're assuming one tweet only contains one video here variants = medium['video_info']['variants'] variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0)) + title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0] urls = [ variants[-1]['url'] ] size = urls_size(urls) mime, ext = variants[-1]['content_type'], 'mp4' - print_info(site_info, page_title, mime, size) + print_info(site_info, title, mime, size) if not info_only: - download_urls(urls, page_title, ext, size, output_dir, merge=merge) + download_urls(urls, title, ext, size, output_dir, merge=merge) else: title = item_id + '_' + medium['media_url_https'].split('.')[-2].split('/')[-1] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/youku.py new/you-get-0.4.1650/src/you_get/extractors/youku.py --- old/you-get-0.4.1620/src/you_get/extractors/youku.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/youku.py 2022-12-11 18:15:46.000000000 +0100 @@ -77,7 +77,7 @@ self.api_error_code = None self.api_error_msg = None - self.ccode = '0532' + self.ccode = '0564' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/extractors/youtube.py new/you-get-0.4.1650/src/you_get/extractors/youtube.py --- old/you-get-0.4.1620/src/you_get/extractors/youtube.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/extractors/youtube.py 2022-12-11 18:15:46.000000000 +0100 @@ -79,6 +79,7 @@ # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js # - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js + # - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js def tr_js(code): code = re.sub(r'function', r'def', code) # add prefix '_sig_' to prevent namespace pollution @@ -114,14 +115,10 @@ else: f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) - f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2) - f2 = re.sub(r'\$', '_dollar', f2) + f2 = re.sub(r'\$', '_dollar', f2) # replace dollar sign code = code + 'global _sig_%s\n' % f2 + tr_js(f2def) - # if f1 contains more than 2 characters, no need to do substitution - # FIXME: we probably shouldn't do any substitution here at all? - f1 = re.sub(r'^(as|if|in|is|or)$', r'_\1', f1) - f1 = re.sub(r'\$', '_dollar', f1) + f1 = re.sub(r'\$', '_dollar', f1) # replace dollar sign code = code + '_sig=_sig_%s(s)' % f1 exec(code, globals(), locals()) return locals()['_sig'] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/src/you_get/version.py new/you-get-0.4.1650/src/you_get/version.py --- old/you-get-0.4.1620/src/you_get/version.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/src/you_get/version.py 2022-12-11 18:15:46.000000000 +0100 @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1620' +__version__ = '0.4.1650' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/you-get-0.4.1620/tests/test.py new/you-get-0.4.1650/tests/test.py --- old/you-get-0.4.1620/tests/test.py 2022-07-01 23:26:50.000000000 +0200 +++ new/you-get-0.4.1650/tests/test.py 2022-12-11 18:15:46.000000000 +0100 @@ -11,7 +11,8 @@ bilibili, soundcloud, tiktok, - twitter + twitter, + miaopai ) @@ -56,11 +57,14 @@ def test_tiktok(self): tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) def test_twitter(self): twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) + def test_weibo(self): + miaopai.download('https://video.weibo.com/show?fid=1034:4825403706245135', info_only=True) if __name__ == '__main__': unittest.main()