jenkins-bot has submitted this change and it was merged.
( https://gerrit.wikimedia.org/r/356307 )
Change subject: Memoize host normalization
......................................................................
Memoize host normalization
Add a small LRU cache so that the normalization of a host that has
already been seen is not recomputed.
Bug: T166628
Change-Id: Ic57d331ba445b37c4e5afc7903d16e607998284c
---
M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
1 file changed, 40 insertions(+), 25 deletions(-)
Approvals:
Joal: Looks good to me, approved
jenkins-bot: Verified
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index cb146fe..e3667ee 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -98,6 +98,12 @@
*/
private Utilities.LRUCache<String, Boolean> agentTypeCache = new
Utilities.LRUCache<>(10000);
+
+ /**
+ * Used to speed up "normalization of hosts"
+ */
+ private Utilities.LRUCache<String, Object> normalizedHostCache = new
Utilities.LRUCache<>(5000);
+
/**
* Pattern for automatically-added subdomains that indicate zero,
* or some similar portal-based interface to MW.
@@ -237,41 +243,50 @@
* @return A NormalizedHostInfo object with project_class, project,
qualifiers and tld values set.
*/
public NormalizedHostInfo normalizeHost(String uriHost) {
+
+ // use LRU cache to not repeat computations
NormalizedHostInfo result = new NormalizedHostInfo();
+
if ((uriHost == null) || (uriHost.isEmpty())) return result;
- // Remove port if any
- int portIdx = uriHost.indexOf(":");
- uriHost = uriHost.substring(0, ((portIdx < 0) ? uriHost.length() :
portIdx));
+ if (normalizedHostCache.containsKey(uriHost.toLowerCase())){
- // Replace multiple dots by only one
- uriHost = uriHost.replaceAll("[//.]+", ".");
+ result =
(NormalizedHostInfo)normalizedHostCache.get(uriHost.toLowerCase());
- // Split by the dots
- String[] uriParts = uriHost.toLowerCase().split("\\.");
+ } else {
+ // Remove port if any
+ int portIdx = uriHost.indexOf(":");
+ uriHost = uriHost.substring(0, ((portIdx < 0) ? uriHost.length() :
portIdx));
- // If no splitted part, return empty
- if (uriParts.length == 0) return result;
+ // Replace multiple dots by only one
+ uriHost = uriHost.replaceAll("[//.]+", ".");
- // Handle special case where TLD is numeric --> assume IP address,
don't normalize
- // Length is > 0 because of previous check, so no error case
- if (uriParts[uriParts.length - 1].matches("[0-9]+")) return result;
+ // Split by the dots
+ String[] uriParts = uriHost.toLowerCase().split("\\.");
- if (uriParts.length > 1) {
- // project_class and TLD normalization
- result.setProjectClass(uriParts[uriParts.length - 2]);
- result.setTld(uriParts[uriParts.length - 1]);
- }
- // project normalization
- if ((uriParts.length > 2) && (! uriParts[0].equals("www")))
- result.setProject(uriParts[0]);
- // qualifiers normalization: xx.[q1.q2.q3].wikixxx.xx
- if (uriParts.length > 3) {
- for (int i = 1; i < uriParts.length - 2; i++) {
- result.addQualifier(uriParts[i]);
+ // If no splitted part, return empty
+ if (uriParts.length == 0) return result;
+
+ // Handle special case where TLD is numeric --> assume IP address,
don't normalize
+ // Length is > 0 because of previous check, so no error case
+ if (uriParts[uriParts.length - 1].matches("[0-9]+")) return result;
+
+ if (uriParts.length > 1) {
+ // project_class and TLD normalization
+ result.setProjectClass(uriParts[uriParts.length - 2]);
+ result.setTld(uriParts[uriParts.length - 1]);
}
+ // project normalization
+ if ((uriParts.length > 2) && (!uriParts[0].equals("www")))
+ result.setProject(uriParts[0]);
+ // qualifiers normalization: xx.[q1.q2.q3].wikixxx.xx
+ if (uriParts.length > 3) {
+ for (int i = 1; i < uriParts.length - 2; i++) {
+ result.addQualifier(uriParts[i]);
+ }
+ }
+ normalizedHostCache.put(uriHost.toLowerCase(),result);
}
-
return result;
}
--
To view, visit https://gerrit.wikimedia.org/r/356307
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ic57d331ba445b37c4e5afc7903d16e607998284c
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Nuria <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits