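"""Index the rendered docs.dapr.io HTML pages into Algolia.

Walks a directory of generated HTML, extracts each page's title, description,
URL, breadcrumb section, and paragraph text, and pushes the resulting records
to an Algolia index as a single batch.

Invocation is assumed to look roughly like this (script name and target
directory are illustrative, not part of this file):

    ALGOLIA_APP_ID=... ALGOLIA_API_WRITE_KEY=... ALGOLIA_INDEX_NAME=... \
        python index.py <rendered-site-directory>
"""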
import os
import sys

from bs4 import BeautifulSoup
from algoliasearch.search_client import SearchClient

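# Index the directory passed as the first command-line argument, or the
# current working directory when no argument is given.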
url = "docs.dapr.io"
if len(sys.argv) > 1:
    starting_directory = os.path.join(os.getcwd(), str(sys.argv[1]))
else:
    starting_directory = os.getcwd()

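# Algolia credentials and the target index name come from the environment;
# the API key must be a write key so the index contents can be replaced.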
ALGOLIA_APP_ID = os.getenv('ALGOLIA_APP_ID')
ALGOLIA_API_KEY = os.getenv('ALGOLIA_API_WRITE_KEY')
ALGOLIA_INDEX_NAME = os.getenv('ALGOLIA_INDEX_NAME')

client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
index = client.init_index(ALGOLIA_INDEX_NAME)

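# Files and directories that should never be indexed.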
excluded_files = [
    "404.html",
]

excluded_directories = [
    "zh-hans",
]

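# Base rank for each top-level docs section; parse_file() adds the breadcrumb
# depth to produce the final per-page rank.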
rankings = {
    "Getting started": 0,
    "Concepts": 100,
    "Developing applications": 200,
    "Operations": 300,
    "Reference": 400,
    "Contributing": 500,
    "Home": 600
}

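# Recursively collect the paths of all indexable HTML pages under `directory`.
# Pages that opt out with an '<!-- DISABLE_ALGOLIA -->' marker are skipped.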
def scan_directory(directory: str, pages: list):
    if os.path.basename(directory) in excluded_directories:
        print(f'Skipping directory: {directory}')
        return
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        if os.path.isfile(path):
            if file.endswith(".html") and file not in excluded_files:
                # Read the page once so the file handle is closed promptly.
                with open(path, encoding="utf8") as page:
                    content = page.read()
                if '<!-- DISABLE_ALGOLIA -->' not in content:
                    print(f'Indexing: {path}')
                    pages.append(path)
                else:
                    print(f'Skipping hidden page: {path}')
        else:
            scan_directory(path, pages)

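# Build one Algolia record for a rendered page: title, description, and URL
# come from its meta tags, the section from its breadcrumb, and the searchable
# body text from its <p> elements.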
def parse_file(path: str):
    data = {}
    data["hierarchy"] = {}
    data["rank"] = 999
    data["subrank"] = 99
    data["type"] = "lvl2"
    data["lvl0"] = ""
    data["lvl1"] = ""
    data["lvl2"] = ""
    data["lvl3"] = ""
    text = ""
    subrank = 0
    with open(path, "r", errors='ignore') as file:
        content = file.read()
        soup = BeautifulSoup(content, "html.parser")
        for meta in soup.find_all("meta"):
            if meta.get("name") == "description":
                data["lvl2"] = meta.get("content")
                data["hierarchy"]["lvl1"] = meta.get("content")
            elif meta.get("property") == "og:title":
                data["lvl0"] = meta.get("content")
                data["hierarchy"]["lvl0"] = meta.get("content")
                data["hierarchy"]["lvl2"] = meta.get("content")
            elif meta.get("property") == "og:url":
                # The path relative to docs.dapr.io doubles as the record's objectID.
                data["url"] = meta.get("content")
                data["path"] = meta.get("content").split(url)[1]
                data["objectID"] = meta.get("content").split(url)[1]
        # Use the breadcrumb depth as a tie-breaker within a section.
        breadcrumbs = soup.find_all("li", class_="breadcrumb-item")
        try:
            subrank = len(breadcrumbs)
            data["subrank"] = subrank
        except Exception:
            subrank = 99
            data["subrank"] = 99
        # Only the first breadcrumb item (the top-level section) sets the rank.
        for bc in breadcrumbs:
            section = bc.text.strip()
            data["lvl1"] = section
            data["hierarchy"]["lvl0"] = section
            try:
                data["rank"] = rankings[section] + subrank
            except KeyError:
                print(f"Rank not found for section {section}")
                data["rank"] = 998
            break
        # Concatenate all paragraph text as the searchable record body.
        for p in soup.find_all("p"):
            if p.text != "":
                text = text + p.text
        data["text"] = text
    return data

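# replace_all_objects swaps in the new set of records, removing everything
# previously in the index; wait() blocks until Algolia has processed the batch.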
def index_payload(payload):
    res = index.replace_all_objects(payload)
    res.wait()


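# Crawl the rendered site, build a record for every indexable page, and push
# the whole batch to Algolia in one call.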
if __name__ == "__main__":
    pages = []
    payload = []
    scan_directory(starting_directory, pages)
    for page in pages:
        data = parse_file(page)
        if "objectID" in data:
            payload.append(data)
    index_payload(payload)