Update crawler to crawl all io.opentelemetry.* groups (#7316)
This commit is contained in:
parent
d70fe5b029
commit
54e5ea7bf2
|
@ -6,14 +6,20 @@
|
||||||
package io.opentelemetry.javadocs;
|
package io.opentelemetry.javadocs;
|
||||||
|
|
||||||
public class Artifact {
|
public class Artifact {
|
||||||
|
private final String group;
|
||||||
private final String name;
|
private final String name;
|
||||||
private final String version;
|
private final String version;
|
||||||
|
|
||||||
public Artifact(String name, String version) {
|
public Artifact(String group, String name, String version) {
|
||||||
|
this.group = group;
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.version = version;
|
this.version = version;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getGroup() {
|
||||||
|
return group;
|
||||||
|
}
|
||||||
|
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
@ -21,4 +27,9 @@ public class Artifact {
|
||||||
public String getVersion() {
|
public String getVersion() {
|
||||||
return version;
|
return version;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return group + ":" + name + ":" + version;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,9 +15,11 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.logging.Level;
|
import java.util.logging.Level;
|
||||||
import java.util.logging.Logger;
|
import java.util.logging.Logger;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The javadoc.io site relies on someone accessing the page for an artifact version in order to
|
* The javadoc.io site relies on someone accessing the page for an artifact version in order to
|
||||||
|
@ -26,7 +28,16 @@ import java.util.logging.Logger;
|
||||||
* pages on the javadoc.io site to trigger updates.
|
* pages on the javadoc.io site to trigger updates.
|
||||||
*/
|
*/
|
||||||
public final class JavaDocsCrawler {
|
public final class JavaDocsCrawler {
|
||||||
private static final String GROUP = "io.opentelemetry";
|
// Track list of groups and the minimum artifact versions that should be crawled. Update to the
|
||||||
|
// latest periodically to avoid crawling artifacts that stopped being published.
|
||||||
|
private static final Map<String, String> GROUPS_AND_MIN_VERSION =
|
||||||
|
Map.of(
|
||||||
|
"io.opentelemetry", "1.49.0",
|
||||||
|
"io.opentelemetry.instrumentation", "2.15.0",
|
||||||
|
"io.opentelemetry.contrib", "1.46.0",
|
||||||
|
"io.opentelemetry.semconv", "1.32.0",
|
||||||
|
"io.opentelemetry.proto", "1.3.2");
|
||||||
|
|
||||||
private static final String MAVEN_CENTRAL_BASE_URL =
|
private static final String MAVEN_CENTRAL_BASE_URL =
|
||||||
"https://search.maven.org/solrsearch/select?q=g:";
|
"https://search.maven.org/solrsearch/select?q=g:";
|
||||||
private static final String JAVA_DOCS_BASE_URL = "https://javadoc.io/doc/";
|
private static final String JAVA_DOCS_BASE_URL = "https://javadoc.io/doc/";
|
||||||
|
@ -41,23 +52,34 @@ public final class JavaDocsCrawler {
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
HttpClient client = HttpClient.newHttpClient();
|
HttpClient client = HttpClient.newHttpClient();
|
||||||
List<Artifact> artifacts = getArtifacts(client);
|
|
||||||
if (artifacts.isEmpty()) {
|
|
||||||
logger.log(Level.SEVERE, "No artifacts found");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
logger.info(String.format(Locale.ROOT, "Found %d artifacts", artifacts.size()));
|
|
||||||
|
|
||||||
List<String> updated = crawlJavaDocs(client, artifacts);
|
for (Map.Entry<String, String> groupAndMinVersion : GROUPS_AND_MIN_VERSION.entrySet()) {
|
||||||
if (updated.isEmpty()) {
|
String group = groupAndMinVersion.getKey();
|
||||||
logger.info("No updates were needed");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Artifacts that triggered updates:\n" + String.join("\n", updated));
|
List<Artifact> artifacts = getArtifacts(client, group);
|
||||||
|
if (artifacts.isEmpty()) {
|
||||||
|
logger.log(Level.SEVERE, "No artifacts found for group " + group);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
logger.info(
|
||||||
|
String.format(Locale.ROOT, "Found %d artifacts for group " + group, artifacts.size()));
|
||||||
|
|
||||||
|
List<Artifact> updated = crawlJavaDocs(client, groupAndMinVersion.getValue(), artifacts);
|
||||||
|
if (updated.isEmpty()) {
|
||||||
|
logger.info("No updates were needed for group " + group);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Artifacts that triggered updates for group "
|
||||||
|
+ group
|
||||||
|
+ ":\n"
|
||||||
|
+ updated.stream().map(Artifact::toString).collect(Collectors.joining("\n")));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static List<Artifact> getArtifacts(HttpClient client) throws IOException, InterruptedException {
|
static List<Artifact> getArtifacts(HttpClient client, String group)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
int start = 0;
|
int start = 0;
|
||||||
Integer numFound;
|
Integer numFound;
|
||||||
List<Artifact> result = new ArrayList<>();
|
List<Artifact> result = new ArrayList<>();
|
||||||
|
@ -67,7 +89,7 @@ public final class JavaDocsCrawler {
|
||||||
Thread.sleep(THROTTLE_MS); // try not to DDoS the site, it gets knocked over easily
|
Thread.sleep(THROTTLE_MS); // try not to DDoS the site, it gets knocked over easily
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<?, ?> map = queryMavenCentral(client, start);
|
Map<?, ?> map = queryMavenCentral(client, group, start);
|
||||||
|
|
||||||
numFound =
|
numFound =
|
||||||
Optional.ofNullable(map)
|
Optional.ofNullable(map)
|
||||||
|
@ -93,18 +115,18 @@ public final class JavaDocsCrawler {
|
||||||
List<Artifact> artifacts = new ArrayList<>();
|
List<Artifact> artifacts = new ArrayList<>();
|
||||||
for (Object doc : docs) {
|
for (Object doc : docs) {
|
||||||
Map<?, ?> docMap = (Map<?, ?>) doc;
|
Map<?, ?> docMap = (Map<?, ?>) doc;
|
||||||
String artifact = (String) docMap.get("a");
|
String group = Objects.requireNonNull((String) docMap.get("g"), "g");
|
||||||
String version = (String) docMap.get("latestVersion");
|
String artifact = Objects.requireNonNull((String) docMap.get("a"), "a");
|
||||||
if (artifact != null && version != null) {
|
String version =
|
||||||
artifacts.add(new Artifact(artifact, version));
|
Objects.requireNonNull((String) docMap.get("latestVersion"), "latestVersion");
|
||||||
}
|
artifacts.add(new Artifact(Objects.requireNonNull(group), artifact, version));
|
||||||
}
|
}
|
||||||
return artifacts;
|
return artifacts;
|
||||||
})
|
})
|
||||||
.orElseGet(ArrayList::new);
|
.orElseGet(ArrayList::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<?, ?> queryMavenCentral(HttpClient client, int start)
|
private static Map<?, ?> queryMavenCentral(HttpClient client, String group, int start)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
URI uri =
|
URI uri =
|
||||||
URI.create(
|
URI.create(
|
||||||
|
@ -112,7 +134,7 @@ public final class JavaDocsCrawler {
|
||||||
Locale.ROOT,
|
Locale.ROOT,
|
||||||
"%s%s&rows=%d&start=%d&wt=json",
|
"%s%s&rows=%d&start=%d&wt=json",
|
||||||
MAVEN_CENTRAL_BASE_URL,
|
MAVEN_CENTRAL_BASE_URL,
|
||||||
GROUP,
|
group,
|
||||||
PAGE_SIZE,
|
PAGE_SIZE,
|
||||||
start));
|
start));
|
||||||
|
|
||||||
|
@ -122,21 +144,35 @@ public final class JavaDocsCrawler {
|
||||||
if (response.statusCode() != 200) {
|
if (response.statusCode() != 200) {
|
||||||
logger.log(
|
logger.log(
|
||||||
Level.SEVERE,
|
Level.SEVERE,
|
||||||
"Unexpected response code: " + response.statusCode() + ": " + response.body());
|
"Unexpected response code "
|
||||||
|
+ response.statusCode()
|
||||||
|
+ " for uri: "
|
||||||
|
+ uri.toASCIIString()
|
||||||
|
+ "\n"
|
||||||
|
+ response.body());
|
||||||
throw new IOException("Unable to pull Maven central artifacts list");
|
throw new IOException("Unable to pull Maven central artifacts list");
|
||||||
}
|
}
|
||||||
return objectMapper.readValue(response.body(), Map.class);
|
return objectMapper.readValue(response.body(), Map.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
|
static List<Artifact> crawlJavaDocs(
|
||||||
|
HttpClient client, String minVersion, List<Artifact> artifacts)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
List<String> updatedArtifacts = new ArrayList<>();
|
List<Artifact> updatedArtifacts = new ArrayList<>();
|
||||||
|
|
||||||
for (Artifact artifact : artifacts) {
|
for (Artifact artifact : artifacts) {
|
||||||
|
if (artifact.getVersion().compareTo(minVersion) < 0) {
|
||||||
|
logger.info(
|
||||||
|
String.format(
|
||||||
|
"Skipping crawling %s due to version %s being less than minVersion %s",
|
||||||
|
artifact, artifact.getVersion(), minVersion));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
String[] parts = artifact.getName().split("-");
|
String[] parts = artifact.getName().split("-");
|
||||||
StringBuilder path = new StringBuilder();
|
StringBuilder path = new StringBuilder();
|
||||||
path.append(JAVA_DOCS_BASE_URL)
|
path.append(JAVA_DOCS_BASE_URL)
|
||||||
.append(GROUP)
|
.append(artifact.getGroup())
|
||||||
.append("/")
|
.append("/")
|
||||||
.append(artifact.getName())
|
.append(artifact.getName())
|
||||||
.append("/")
|
.append("/")
|
||||||
|
@ -146,6 +182,7 @@ public final class JavaDocsCrawler {
|
||||||
.append("/package-summary.html");
|
.append("/package-summary.html");
|
||||||
|
|
||||||
HttpRequest crawlRequest = HttpRequest.newBuilder(URI.create(path.toString())).GET().build();
|
HttpRequest crawlRequest = HttpRequest.newBuilder(URI.create(path.toString())).GET().build();
|
||||||
|
logger.info(String.format("Crawling %s at: %s", artifact, path));
|
||||||
HttpResponse<String> crawlResponse =
|
HttpResponse<String> crawlResponse =
|
||||||
client.send(crawlRequest, HttpResponse.BodyHandlers.ofString());
|
client.send(crawlRequest, HttpResponse.BodyHandlers.ofString());
|
||||||
|
|
||||||
|
@ -156,7 +193,7 @@ public final class JavaDocsCrawler {
|
||||||
String.format(
|
String.format(
|
||||||
Locale.ROOT,
|
Locale.ROOT,
|
||||||
"Crawl failed for %s with status code %d at URL %s\nResponse: %s",
|
"Crawl failed for %s with status code %d at URL %s\nResponse: %s",
|
||||||
artifact.getName(),
|
artifact,
|
||||||
crawlResponse.statusCode(),
|
crawlResponse.statusCode(),
|
||||||
path,
|
path,
|
||||||
crawlResponse.body()));
|
crawlResponse.body()));
|
||||||
|
@ -164,7 +201,7 @@ public final class JavaDocsCrawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlResponse.body().contains(JAVA_DOC_DOWNLOADED_TEXT)) {
|
if (crawlResponse.body().contains(JAVA_DOC_DOWNLOADED_TEXT)) {
|
||||||
updatedArtifacts.add(artifact.getName());
|
updatedArtifacts.add(artifact);
|
||||||
}
|
}
|
||||||
|
|
||||||
Thread.sleep(THROTTLE_MS); // some light throttling
|
Thread.sleep(THROTTLE_MS); // some light throttling
|
||||||
|
|
|
@ -16,7 +16,6 @@ import java.io.IOException;
|
||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
import java.net.http.HttpRequest;
|
import java.net.http.HttpRequest;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
@ -39,8 +38,8 @@ class JavaDocsCrawlerTest {
|
||||||
"response": {
|
"response": {
|
||||||
"numFound": 40,
|
"numFound": 40,
|
||||||
"docs": [
|
"docs": [
|
||||||
{"a": "artifact1", "latestVersion": "1.0"},
|
{"g": "group", "a": "artifact1", "latestVersion": "1.0"},
|
||||||
{"a": "artifact2", "latestVersion": "1.1"}
|
{"g": "group", "a": "artifact2", "latestVersion": "1.1"}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,7 +50,7 @@ class JavaDocsCrawlerTest {
|
||||||
"response": {
|
"response": {
|
||||||
"numFound": 40,
|
"numFound": 40,
|
||||||
"docs": [
|
"docs": [
|
||||||
{"a": "artifact3", "latestVersion": "2.0"}
|
{"g": "group", "a": "artifact3", "latestVersion": "2.0"}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -66,7 +65,7 @@ class JavaDocsCrawlerTest {
|
||||||
.thenReturn(mockMavenCentralRequest1)
|
.thenReturn(mockMavenCentralRequest1)
|
||||||
.thenReturn(mockMavenCentralRequest2);
|
.thenReturn(mockMavenCentralRequest2);
|
||||||
|
|
||||||
List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient);
|
List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient, "io.opentelemetry");
|
||||||
|
|
||||||
// 2 calls for the pagination
|
// 2 calls for the pagination
|
||||||
verify(mockClient, times(2)).send(any(), any());
|
verify(mockClient, times(2)).send(any(), any());
|
||||||
|
@ -75,8 +74,7 @@ class JavaDocsCrawlerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testCrawler() throws IOException, InterruptedException {
|
void testCrawler() throws IOException, InterruptedException {
|
||||||
List<Artifact> artifacts = new ArrayList<>();
|
Artifact artifact = new Artifact("io.opentelemetry", "opentelemetry-context", "1.49.0");
|
||||||
artifacts.add(new Artifact("opentelemetry-context", "1.49.0"));
|
|
||||||
ArgumentCaptor<HttpRequest> requestCaptor = ArgumentCaptor.forClass(HttpRequest.class);
|
ArgumentCaptor<HttpRequest> requestCaptor = ArgumentCaptor.forClass(HttpRequest.class);
|
||||||
|
|
||||||
when(mockJavaDocResponse.body()).thenReturn(JAVA_DOC_DOWNLOADED_TEXT);
|
when(mockJavaDocResponse.body()).thenReturn(JAVA_DOC_DOWNLOADED_TEXT);
|
||||||
|
@ -84,13 +82,13 @@ class JavaDocsCrawlerTest {
|
||||||
|
|
||||||
when(mockClient.send(any(), any())).thenReturn(mockJavaDocResponse);
|
when(mockClient.send(any(), any())).thenReturn(mockJavaDocResponse);
|
||||||
|
|
||||||
List<String> updated = JavaDocsCrawler.crawlJavaDocs(mockClient, artifacts);
|
List<Artifact> updated = JavaDocsCrawler.crawlJavaDocs(mockClient, "1.49.0", List.of(artifact));
|
||||||
|
|
||||||
verify(mockClient, times(1)).send(requestCaptor.capture(), any());
|
verify(mockClient, times(1)).send(requestCaptor.capture(), any());
|
||||||
|
|
||||||
assertThat(requestCaptor.getValue().uri().toString())
|
assertThat(requestCaptor.getValue().uri().toString())
|
||||||
.isEqualTo(
|
.isEqualTo(
|
||||||
"https://javadoc.io/doc/io.opentelemetry/opentelemetry-context/1.49.0/opentelemetry/context/package-summary.html");
|
"https://javadoc.io/doc/io.opentelemetry/opentelemetry-context/1.49.0/opentelemetry/context/package-summary.html");
|
||||||
assertThat(updated).containsExactly("opentelemetry-context");
|
assertThat(updated).containsExactly(artifact);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue