From c52c606e6ae09291378c6eae17938d642dc2e463 Mon Sep 17 00:00:00 2001 From: Johannes Christ Date: Sun, 11 Jul 2021 22:56:17 +0200 Subject: Improvements & fixes to homepage repository display. These changes mainly aim to prevent multiple concurrent requests from deadlocking the table due to the lazy evaluation imposed by `objects.all()`, and introduce some quality of life changes whilst doing so. To prevent unwanted locks, the following is done: - Instead of evaluating the first item of `objects.all()` only in order to determine when the table was most recently updated, we query for the least recent updated entry and use that instead. As we use `.first()`, the model instance is loaded directly. The following quality of life changes are introduced: - Instead of manual "update or create" logic for repositories retrieved from the API, use Django's `update_or_create` instead. - Instead of manual bulk creation of entries on initial data retrieve, Django's `bulk_create` method is used instead. - To allow for local testing of this endpoint without having to manually set up GitHub REST authorization, anonymous GitHub access is used when no `GITHUB_TOKEN` is configured. --- pydis_site/apps/home/views/home.py | 62 +++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'pydis_site/apps/home/views/home.py') diff --git a/pydis_site/apps/home/views/home.py b/pydis_site/apps/home/views/home.py index b3767d37..23b6ab24 100644 --- a/pydis_site/apps/home/views/home.py +++ b/pydis_site/apps/home/views/home.py @@ -19,7 +19,6 @@ class HomeView(View): github_api = "https://api.github.com/users/python-discord/repos?per_page=100" repository_cache_ttl = 3600 - headers = {"Authorization": f"token {GITHUB_TOKEN}"} # Which of our GitHub repos should be displayed on the front page, and in which order? repos = [ @@ -35,6 +34,16 @@ class HomeView(View): """Clean up stale RepositoryMetadata.""" RepositoryMetadata.objects.exclude(repo_name__in=self.repos).delete() + # If no token is defined (for example in local development), then + # it does not make sense to pass the Authorization header. More + # specifically, GitHub will reject any requests from us due to the + # invalid header. We can make a limited number of anonymous requests + # though, which is useful for testing. + if GITHUB_TOKEN: + self.headers = {"Authorization": f"token {GITHUB_TOKEN}"} + else: + self.headers = {} + def _get_api_data(self) -> Dict[str, Dict[str, str]]: """ Call the GitHub API and get information about our repos. @@ -74,35 +83,30 @@ class HomeView(View): def _get_repo_data(self) -> List[RepositoryMetadata]: """Build a list of RepositoryMetadata objects that we can use to populate the front page.""" - database_repositories = [] - - # First, let's see if we have any metadata cached. - cached_data = RepositoryMetadata.objects.all() + # First off, load the timestamp of the least recently updated entry. + oldest_entry = RepositoryMetadata.objects.order_by("last_updated").first() - # If we don't, we have to create some! - if not cached_data: + # If we did not retrieve any results here, we should import them! + if oldest_entry is None: # Try to get new data from the API. If it fails, we'll return an empty list. # In this case, we simply don't display our projects on the site. api_repositories = self._get_api_data() # Create all the repodata records in the database. - for api_data in api_repositories.values(): - repo_data = RepositoryMetadata( + return RepositoryMetadata.objects.bulk_create( + RepositoryMetadata( repo_name=api_data["full_name"], description=api_data["description"], forks=api_data["forks_count"], stargazers=api_data["stargazers_count"], language=api_data["language"], ) - - repo_data.save() - database_repositories.append(repo_data) - - return database_repositories + for api_data in api_repositories.values() + ) # If the data is stale, we should refresh it. - if (timezone.now() - cached_data[0].last_updated).seconds > self.repository_cache_ttl: + if (timezone.now() - oldest_entry.last_updated).seconds > self.repository_cache_ttl: # Try to get new data from the API. If it fails, return the cached data. api_repositories = self._get_api_data() @@ -110,22 +114,18 @@ class HomeView(View): return RepositoryMetadata.objects.all() # Update or create all RepoData objects in self.repos - for repo_name, api_data in api_repositories.items(): - try: - repo_data = RepositoryMetadata.objects.get(repo_name=repo_name) - repo_data.description = api_data["description"] - repo_data.language = api_data["language"] - repo_data.forks = api_data["forks_count"] - repo_data.stargazers = api_data["stargazers_count"] - except RepositoryMetadata.DoesNotExist: - repo_data = RepositoryMetadata( - repo_name=api_data["full_name"], - description=api_data["description"], - forks=api_data["forks_count"], - stargazers=api_data["stargazers_count"], - language=api_data["language"], - ) - repo_data.save() + database_repositories = [] + for api_data in api_repositories.values(): + repo_data, _created = RepositoryMetadata.objects.update_or_create( + repo_name=api_data["full_name"], + defaults={ + 'repo_name': api_data["full_name"], + 'description': api_data["description"], + 'forks': api_data["forks_count"], + 'stargazers': api_data["stargazers_count"], + 'language': api_data["language"], + } + ) database_repositories.append(repo_data) return database_repositories -- cgit v1.2.3 From 3a1f4230cc9f660ff118f9bbf5591ea8aa87ef07 Mon Sep 17 00:00:00 2001 From: Johannes Christ Date: Thu, 15 Jul 2021 20:44:52 +0200 Subject: Address review comment. --- pydis_site/apps/home/views/home.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'pydis_site/apps/home/views/home.py') diff --git a/pydis_site/apps/home/views/home.py b/pydis_site/apps/home/views/home.py index 23b6ab24..0f26cef3 100644 --- a/pydis_site/apps/home/views/home.py +++ b/pydis_site/apps/home/views/home.py @@ -84,10 +84,13 @@ class HomeView(View): def _get_repo_data(self) -> List[RepositoryMetadata]: """Build a list of RepositoryMetadata objects that we can use to populate the front page.""" # First off, load the timestamp of the least recently updated entry. - oldest_entry = RepositoryMetadata.objects.order_by("last_updated").first() + last_update = ( + RepositoryMetadata.objects.values_list("last_updated", flat=True) + .order_by("last_updated").first() + ) # If we did not retrieve any results here, we should import them! - if oldest_entry is None: + if last_update is None: # Try to get new data from the API. If it fails, we'll return an empty list. # In this case, we simply don't display our projects on the site. @@ -106,7 +109,7 @@ class HomeView(View): ) # If the data is stale, we should refresh it. - if (timezone.now() - oldest_entry.last_updated).seconds > self.repository_cache_ttl: + if (timezone.now() - last_update).seconds > self.repository_cache_ttl: # Try to get new data from the API. If it fails, return the cached data. api_repositories = self._get_api_data() -- cgit v1.2.3 From fa68a1063d4de205cb34c1f17eda62926b23b9db Mon Sep 17 00:00:00 2001 From: Chris Lovering Date: Fri, 3 Sep 2021 10:40:16 +0100 Subject: Timeout when fetching GitHub repository metadata Not having this timeout could cause a worker to hang indefinitely --- pydis_site/apps/home/views/home.py | 15 +++++++++++---- pydis_site/constants.py | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'pydis_site/apps/home/views/home.py') diff --git a/pydis_site/apps/home/views/home.py b/pydis_site/apps/home/views/home.py index 0f26cef3..bbb4b815 100644 --- a/pydis_site/apps/home/views/home.py +++ b/pydis_site/apps/home/views/home.py @@ -9,7 +9,7 @@ from django.utils import timezone from django.views import View from pydis_site.apps.home.models import RepositoryMetadata -from pydis_site.constants import GITHUB_TOKEN +from pydis_site.constants import GITHUB_TOKEN, TIMEOUT_PERIOD log = logging.getLogger(__name__) @@ -51,9 +51,16 @@ class HomeView(View): If we're unable to get that info for any reason, return an empty dict. """ repo_dict = {} - - # Fetch the data from the GitHub API - api_data: List[dict] = requests.get(self.github_api, headers=self.headers).json() + try: + # Fetch the data from the GitHub API + api_data: List[dict] = requests.get( + self.github_api, + headers=self.headers, + timeout=TIMEOUT_PERIOD + ).json() + except requests.exceptions.Timeout: + log.error("Request to fetch GitHub repository metadata for timed out!") + return repo_dict # Process the API data into our dict for repo in api_data: diff --git a/pydis_site/constants.py b/pydis_site/constants.py index e6a63d12..e913f40f 100644 --- a/pydis_site/constants.py +++ b/pydis_site/constants.py @@ -2,3 +2,5 @@ import os GIT_SHA = os.environ.get("GIT_SHA", "development") GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") +# How long to wait for synchronous requests before timing out +TIMEOUT_PERIOD = int(os.environ.get("TIMEOUT_PERIOD", 5)) -- cgit v1.2.3