mirror of https://github.com/dapr/docs.git
Upgrade Algolia search to v3 (#3560)
* update dapr publish command
* Split workflow into two steps
* Update upload path
* Add concurrency check
* Add Algolia workflow script and step
* Update Algolia box to v3
* Fix secret name
* Override default search bar in Docsy v3
* Remove temporary comment
* Consolidate build and deploy

Signed-off-by: Hannah Hunter <hannahhunter@microsoft.com>
Signed-off-by: Aaron Crawfis <Aaron.Crawfis@microsoft.com>
Co-authored-by: Hannah Hunter <hannahhunter@microsoft.com>
Co-authored-by: Mark Fussell <markfussell@gmail.com>
parent b9759702d5 · commit 0d0d29ac92
.github/scripts/algolia.py (new file)
@@ -0,0 +1,118 @@

import os
import sys
import json

from bs4 import BeautifulSoup
from algoliasearch.search_client import SearchClient

# Root URL of the published site; used to turn og:url values into relative paths.
url = "docs.dapr.io"
if len(sys.argv) > 1:
    starting_directory = os.path.join(os.getcwd(), str(sys.argv[1]))
else:
    starting_directory = os.getcwd()

ALGOLIA_APP_ID = os.getenv('ALGOLIA_APP_ID')
ALGOLIA_API_KEY = os.getenv('ALGOLIA_API_WRITE_KEY')
ALGOLIA_INDEX_NAME = os.getenv('ALGOLIA_INDEX_NAME')

client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
index = client.init_index(ALGOLIA_INDEX_NAME)

excluded_files = [
    "404.html",
]

excluded_directories = [
    "zh-hans",
]

# Top-level docs sections, ranked so search results surface them in this order.
rankings = {
    "Getting started": 0,
    "Concepts": 100,
    "Developing applications": 200,
    "Operations": 300,
    "Reference": 400,
    "Contributing": 500,
    "Home": 600
}


def scan_directory(directory: str, pages: list):
    # Recursively collect every indexable .html page under `directory`.
    if os.path.basename(directory) in excluded_directories:
        print(f'Skipping directory: {directory}')
        return
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        if os.path.isfile(path):
            if file.endswith(".html") and file not in excluded_files:
                if '<!-- DISABLE_ALGOLIA -->' not in open(path, encoding="utf8").read():
                    print(f'Indexing: {path}')
                    pages.append(path)
                else:
                    print(f'Skipping hidden page: {path}')
        else:
            scan_directory(path, pages)


def parse_file(path: str):
    # Build one Algolia record from a rendered HTML page.
    data = {}
    data["hierarchy"] = {}
    data["rank"] = 999
    data["subrank"] = 99
    data["type"] = "lvl2"
    data["lvl0"] = ""
    data["lvl1"] = ""
    data["lvl2"] = ""
    data["lvl3"] = ""
    text = ""
    subrank = 0
    with open(path, "r", errors='ignore') as file:
        content = file.read()
        soup = BeautifulSoup(content, "html.parser")
        for meta in soup.find_all("meta"):
            if meta.get("name") == "description":
                data["lvl2"] = meta.get("content")
                data["hierarchy"]["lvl1"] = meta.get("content")
            elif meta.get("property") == "og:title":
                data["lvl0"] = meta.get("content")
                data["hierarchy"]["lvl0"] = meta.get("content")
                data["hierarchy"]["lvl2"] = meta.get("content")
            elif meta.get("property") == "og:url":
                data["url"] = meta.get("content")
                data["path"] = meta.get("content").split(url)[1]
                data["objectID"] = meta.get("content").split(url)[1]
        # Use the breadcrumb depth as a sub-rank within the page's section.
        breadcrumbs = soup.find_all("li", class_="breadcrumb-item")
        try:
            subrank = len(breadcrumbs)
            data["subrank"] = subrank
        except Exception:
            subrank = 99
            data["subrank"] = 99
        for bc in breadcrumbs:
            section = bc.text.strip()
            data["lvl1"] = section
            data["hierarchy"]["lvl0"] = section
            try:
                data["rank"] = rankings[section] + subrank
            except KeyError:
                print(f"Rank not found for section {section}")
                data["rank"] = 998
            break
        # Concatenate all paragraph text as the searchable page body.
        for p in soup.find_all("p"):
            if p.text != "":
                text = text + p.text
        data["text"] = text
    return data


def index_payload(payload):
    # Atomically replace the entire index with the freshly built records.
    res = index.replace_all_objects(payload)
    res.wait()


if __name__ == "__main__":
    pages = []
    payload = []
    scan_directory(starting_directory, pages)
    for page in pages:
        data = parse_file(page)
        if "objectID" in data:
            payload.append(data)
    index_payload(payload)
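For orientation, a record built by parse_file() for a typical docs page has roughly the shape sketched below. The field names come from the script above; the values are illustrative placeholders, not taken from a real page.

# Hypothetical example of one record pushed to Algolia by algolia.py.
example_record = {
    "objectID": "/operations/observability/",          # relative path from og:url, also stored as "path"
    "url": "https://docs.dapr.io/operations/observability/",
    "path": "/operations/observability/",
    "lvl0": "Observability",                            # og:title
    "lvl1": "Operations",                               # breadcrumb section
    "lvl2": "How to monitor applications through tracing, metrics and logs",  # meta description
    "lvl3": "",
    "hierarchy": {
        "lvl0": "Operations",                           # breadcrumb section (overwrites og:title)
        "lvl1": "How to monitor applications through tracing, metrics and logs",
        "lvl2": "Observability",
    },
    "rank": 302,                                        # rankings["Operations"] (300) + breadcrumb depth
    "subrank": 2,                                       # number of breadcrumb items
    "type": "lvl2",
    "text": "Concatenated paragraph text from the page body...",
}
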
@@ -79,3 +79,29 @@ jobs:
         with:
           azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_PROUD_BAY_0E9E0E81E }}
           action: "close"
+
+  algolia_index:
+    name: Index site for Algolia
+    if: github.event_name == 'push'
+    needs: ['build_and_deploy_job']
+    runs-on: ubuntu-latest
+    env:
+      ALGOLIA_APP_ID: ${{ secrets.ALGOLIA_APP_ID }}
+      ALGOLIA_API_WRITE_KEY: ${{ secrets.ALGOLIA_API_WRITE_KEY }}
+      ALGOLIA_INDEX_NAME: daprdocs
+    steps:
+      - name: Checkout docs repo
+        uses: actions/checkout@v2
+        with:
+          submodules: false
+      - name: Download Hugo artifacts
+        uses: actions/download-artifact@v3
+        with:
+          name: hugo_build
+          path: site/
+      - name: Install Python packages
+        run: |
+          pip install --upgrade bs4
+          pip install --upgrade 'algoliasearch>=2.0,<3.0'
+      - name: Index site
+        run: python ./.github/scripts/algolia.py ./site
@@ -1,19 +1,13 @@
-<script src="/js/copy-code-button.js"></script>
-
 {{ with .Site.Params.algolia_docsearch }}
-<script src="https://cdn.jsdelivr.net/npm/docsearch.js@2.6.3/dist/cdn/docsearch.min.js"></script>
-<script>
+<script src="https://cdn.jsdelivr.net/npm/@docsearch/js@3"></script>
+<script type="text/javascript">
   docsearch({
-    // Your apiKey and indexName will be given to you once
-    // we create your config
-    apiKey: '54ae43aa28ce8f00c54c8d5f544d29b9',
-    indexName: 'crawler_dapr',
+    container: '#docsearch',
     appId: 'O0QLQGNF38',
-    // Replace inputSelector with a CSS selector
-    // matching your search input
-    inputSelector: '.td-search-input',
-    // Set debug to true to inspect the dropdown
-    debug: false,
+    apiKey: '54ae43aa28ce8f00c54c8d5f544d29b9',
+    indexName: 'daprdocs',
   });
 </script>
 {{ end }}
+
+<script src="/js/copy-code-button.js"></script>
@@ -1,3 +1,3 @@
 {{ with .Site.Params.algolia_docsearch }}
-<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@docsearch/css@3" />
 {{ end }}
@@ -0,0 +1,30 @@

{{ if .Site.Params.gcs_engine_id -}}
<input type="search" class="form-control td-search-input" placeholder=" {{ T "ui_search" }}" aria-label="{{ T "ui_search" }}" autocomplete="off">
{{ else if .Site.Params.algolia_docsearch -}}
<div id="docsearch"></div>
{{ else if .Site.Params.offlineSearch -}}
{{ $offlineSearchIndex := resources.Get "json/offline-search-index.json" | resources.ExecuteAsTemplate "offline-search-index.json" . -}}
{{ if hugo.IsProduction -}}
{{/* Use `md5` as the fingerprint hash function to shorten the file name and avoid a `file name too long` error. */ -}}
{{ $offlineSearchIndex = $offlineSearchIndex | fingerprint "md5" -}}
{{ end -}}
{{ $offlineSearchLink := $offlineSearchIndex.RelPermalink -}}

<input
  type="search"
  class="form-control td-search-input"
  placeholder=" {{ T "ui_search" }}"
  aria-label="{{ T "ui_search" }}"
  autocomplete="off"
  {{/*
    The data attribute name of the json file URL must end with `src` since
    Hugo's absurlreplacer requires a `src`, `href`, `action` or `srcset` suffix for the attribute name.
    If the absurlreplacer is not applied, the URL will start with `/`.
    That causes a json file loading error when relativeURLs is enabled.
    https://github.com/google/docsy/issues/181
  */}}
  data-offline-search-index-json-src="{{ $offlineSearchLink }}"
  data-offline-search-base-href="/"
  data-offline-search-max-results="{{ .Site.Params.offlineSearchMaxResults | default 10 }}"
>
{{ end -}}
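Once the workflow has run, one way to confirm that records actually landed in the daprdocs index that the DocSearch frontend above queries is to search it with the same algoliasearch v2 Python client the script uses. This is a hedged sketch: the ALGOLIA_SEARCH_KEY variable name is an assumption (any key with search rights works), and the query string is arbitrary.

import os
from algoliasearch.search_client import SearchClient

# Assumption: ALGOLIA_SEARCH_KEY holds a search-capable API key; the app ID and
# index name match the values used by the workflow and templates above.
client = SearchClient.create(os.environ["ALGOLIA_APP_ID"], os.environ["ALGOLIA_SEARCH_KEY"])
index = client.init_index("daprdocs")

# Query the index and print a few hits with the rank computed by algolia.py.
res = index.search("state management", {"hitsPerPage": 3})
for hit in res["hits"]:
    print(hit["rank"], hit["url"])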