docs/.github/scripts/algolia.py

import os
import sys

from bs4 import BeautifulSoup
from algoliasearch.search_client import SearchClient
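
# This script crawls the built documentation HTML, extracts metadata from
# each page, and replaces the records in an Algolia index. It expects
# ALGOLIA_APP_ID, ALGOLIA_API_WRITE_KEY, and ALGOLIA_INDEX_NAME to be set
# in the environment.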
url = "docs.dapr.io"
if len(sys.argv) > 1:
starting_directory = os.path.join(os.getcwd(), str(sys.argv[1]))
else:
starting_directory = os.getcwd()
ALGOLIA_APP_ID = os.getenv('ALGOLIA_APP_ID')
ALGOLIA_API_KEY = os.getenv('ALGOLIA_API_WRITE_KEY')
ALGOLIA_INDEX_NAME = os.getenv('ALGOLIA_INDEX_NAME')
client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
index = client.init_index(ALGOLIA_INDEX_NAME)

# Files and directories to skip during the crawl.
excluded_files = [
    "404.html",
]

excluded_directories = [
    "zh-hans",
]

# Base rank per top-level section; breadcrumb depth is added as a subrank.
rankings = {
    "Getting started": 0,
    "Concepts": 100,
    "Developing applications": 200,
    "Operations": 300,
    "Reference": 400,
    "Contributing": 500,
    "Home": 600,
}

def scan_directory(directory: str, pages: list):
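    """Recursively collect the paths of indexable HTML pages under `directory` into `pages`."""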
    if os.path.basename(directory) in excluded_directories:
        print(f'Skipping directory: {directory}')
        return
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        if os.path.isfile(path):
            if file.endswith(".html") and file not in excluded_files:
                # Pages can opt out of indexing with a marker comment.
                with open(path, encoding="utf8") as f:
                    content = f.read()
                if '<!-- DISABLE_ALGOLIA -->' not in content:
                    print(f'Indexing: {path}')
                    pages.append(path)
                else:
                    print(f'Skipping hidden page: {path}')
        else:
            scan_directory(path, pages)

def parse_file(path: str):
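    """Extract an Algolia search record from a single rendered HTML page."""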
    data = {}
    data["hierarchy"] = {}
    data["rank"] = 999
    data["subrank"] = 99
    data["type"] = "lvl2"
    data["lvl0"] = ""
    data["lvl1"] = ""
    data["lvl2"] = ""
    data["lvl3"] = ""
    text = ""
    subrank = 0
    with open(path, "r", errors='ignore') as file:
        content = file.read()
    soup = BeautifulSoup(content, "html.parser")

    # Pull the title, description, and canonical URL from the page's meta tags.
    for meta in soup.find_all("meta"):
        if meta.get("name") == "description":
            data["lvl2"] = meta.get("content")
            data["hierarchy"]["lvl1"] = meta.get("content")
        elif meta.get("property") == "og:title":
            data["lvl0"] = meta.get("content")
            data["hierarchy"]["lvl0"] = meta.get("content")
            data["hierarchy"]["lvl2"] = meta.get("content")
        elif meta.get("property") == "og:url":
            # The path relative to the site root doubles as the objectID.
            data["url"] = meta.get("content")
            data["path"] = meta.get("content").split(url)[1]
            data["objectID"] = meta.get("content").split(url)[1]

    # Breadcrumb depth doubles as the subrank, so deeper pages sort after
    # shallower ones within the same section.
    breadcrumbs = soup.find_all("li", class_="breadcrumb-item")
    subrank = len(breadcrumbs)
    data["subrank"] = subrank

    # The first breadcrumb names the top-level section, which sets the base rank.
    if breadcrumbs:
        section = breadcrumbs[0].text.strip()
        data["lvl1"] = section
        data["hierarchy"]["lvl0"] = section
        try:
            data["rank"] = rankings[section] + subrank
        except KeyError:
            print(f"Rank not found for section {section}")
            data["rank"] = 998

    # Concatenate all paragraph text to form the searchable body.
    for p in soup.find_all("p"):
        if p.text != "":
            text += p.text
    data["text"] = text
    return data

def index_payload(payload):
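    """Replace every record in the index with `payload` and wait for completion.

    Algolia's replace_all_objects swaps the new record set in atomically,
    so stale pages disappear from search in the same operation.
    """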
    res = index.replace_all_objects(payload)
    res.wait()

if __name__ == "__main__":
    pages = []
    payload = []
    scan_directory(starting_directory, pages)
    for page in pages:
        data = parse_file(page)
        # Only pages that exposed an og:url (and so have an objectID) are pushed.
        if "objectID" in data:
            payload.append(data)
    index_payload(payload)
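
# A minimal usage sketch, assuming the built HTML lives in a `public/`
# directory (an assumption; pass whatever directory the site builds into):
#
#   export ALGOLIA_APP_ID=... ALGOLIA_API_WRITE_KEY=... ALGOLIA_INDEX_NAME=...
#   python docs/.github/scripts/algolia.py public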