From f230d91b437c806e3e2dad37318a5ce77d208fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 5 Dec 2019 21:24:57 +0100 Subject: [PATCH] check the validity of links We add a script and a CI workflow to check that all download links present in the source code are valid. --- .circleci/config.yml | 11 ++++++ utils/link_tester.py | 79 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 utils/link_tester.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 01e6d82b33..ebfbd79b93 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -82,6 +82,16 @@ jobs: - run: sudo pip install --progress-bar off -r docs/requirements.txt - run: sudo pip install --progress-bar off -r requirements.txt - run: ./.circleci/deploy.sh + repository_consistency: + working_directory: ~/transformers + docker: + - image: circleci/python:3.5 + resource_class: small + parallelism: 1 + steps: + - checkout + - run: sudo pip install requests + - run: python ./utils/link_tester.py workflow_filters: &workflow_filters filters: branches: @@ -91,6 +101,7 @@ workflows: version: 2 build_and_test: jobs: + - repository_consistency - build_py3_torch_and_tf - build_py3_torch - build_py3_tf diff --git a/utils/link_tester.py b/utils/link_tester.py new file mode 100644 index 0000000000..fe3990d28c --- /dev/null +++ b/utils/link_tester.py @@ -0,0 +1,79 @@ +""" Link tester. + +This little utility reads all the python files in the repository, +scans for links pointing to S3 and tests the links one by one. Raises an error +at the end of the scan if at least one link was reported broken. +""" +import os +import re +import sys + +import requests + + +REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1""" + + +def list_python_files_in_repository(): + """ List all python files in the repository. + + This function assumes that the script is executed in the root folder. + """ + source_code_files = [] + for path, subdirs, files in os.walk("."): + if "templates" in path: + continue + for name in files: + if ".py" in name and ".pyc" not in name: + path_to_files = os.path.join(path, name) + source_code_files.append(path_to_files) + + return source_code_files + + +def find_all_links(file_paths): + links = [] + for path in file_paths: + links += scan_code_for_links(path) + + return links + + +def scan_code_for_links(source): + """ Scans the file to find links using a regular expression. + Returns a list of links. + """ + with open(source, 'r') as content: + content = content.read() + raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) + links = [prefix + suffix for _, prefix, suffix in raw_links] + + return links + + +def check_all_links(links): + """ Check that the provided links are valid. + + Links are considered valid if a HEAD request to the server + returns a 200 status code. + """ + broken_links = [] + for link in links: + head = requests.head(link) + if head.status_code != 200: + broken_links.append(link) + + return broken_links + + +if __name__ == "__main__": + file_paths = list_python_files_in_repository() + links = find_all_links(file_paths) + broken_links = check_all_links(links) + print("Looking for broken links to pre-trained models/configs/tokenizers...") + if broken_links: + print("The following links did not respond:") + for link in broken_links: + print("- {}".format(link)) + sys.exit(1) + print("All links are ok.")