jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/356307 )

Change subject: Memoize host normalization
......................................................................


Memoize host normalization

Add a small LRU cache so that host normalization is not
recomputed for hosts that have already been seen.

Bug:T166628
Change-Id: Ic57d331ba445b37c4e5afc7903d16e607998284c
---
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
1 file changed, 40 insertions(+), 25 deletions(-)

Approvals:
  Joal: Looks good to me, approved
  jenkins-bot: Verified



diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index cb146fe..e3667ee 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -98,6 +98,12 @@
      */
     private Utilities.LRUCache<String, Boolean> agentTypeCache = new 
Utilities.LRUCache<>(10000);
 
+
+    /**
+     * Used to speed up "normalization of hosts"
+     */
+    private Utilities.LRUCache<String, Object> normalizedHostCache = new 
Utilities.LRUCache<>(5000);
+
     /**
      * Pattern for automatically-added subdomains that indicate zero,
      * or some similar portal-based interface to MW.
@@ -237,41 +243,50 @@
      * @return A NormalizedHostInfo object with project_class, project, 
qualifiers and tld values set.
      */
     public NormalizedHostInfo normalizeHost(String uriHost) {
+
+        // use LRU cache to not repeat computations
         NormalizedHostInfo result = new NormalizedHostInfo();
+
         if ((uriHost == null) || (uriHost.isEmpty())) return result;
 
-        // Remove port if any
-        int portIdx = uriHost.indexOf(":");
-        uriHost = uriHost.substring(0, ((portIdx < 0) ? uriHost.length() : 
portIdx));
+        if (normalizedHostCache.containsKey(uriHost.toLowerCase())){
 
-        // Replace multiple dots by only one
-        uriHost = uriHost.replaceAll("[//.]+", ".");
+            result = 
(NormalizedHostInfo)normalizedHostCache.get(uriHost.toLowerCase());
 
-        // Split by the dots
-        String[] uriParts = uriHost.toLowerCase().split("\\.");
+        }  else {
+            // Remove port if any
+            int portIdx = uriHost.indexOf(":");
+            uriHost = uriHost.substring(0, ((portIdx < 0) ? uriHost.length() : 
portIdx));
 
-        // If no splitted part, return empty
-        if (uriParts.length == 0) return result;
+            // Replace multiple dots by only one
+            uriHost = uriHost.replaceAll("[//.]+", ".");
 
-        // Handle special case where TLD is numeric --> assume IP address, 
don't normalize
-        // Length is > 0 because of previous check, so no error case
-        if (uriParts[uriParts.length - 1].matches("[0-9]+")) return result;
+            // Split by the dots
+            String[] uriParts = uriHost.toLowerCase().split("\\.");
 
-        if (uriParts.length > 1) {
-            // project_class and TLD normalization
-            result.setProjectClass(uriParts[uriParts.length - 2]);
-            result.setTld(uriParts[uriParts.length - 1]);
-        }
-        // project normalization
-        if ((uriParts.length > 2) && (! uriParts[0].equals("www")))
-            result.setProject(uriParts[0]);
-        // qualifiers normalization: xx.[q1.q2.q3].wikixxx.xx
-        if (uriParts.length > 3) {
-            for (int i = 1; i < uriParts.length - 2; i++) {
-                result.addQualifier(uriParts[i]);
+            // If no splitted part, return empty
+            if (uriParts.length == 0) return result;
+
+            // Handle special case where TLD is numeric --> assume IP address, 
don't normalize
+            // Length is > 0 because of previous check, so no error case
+            if (uriParts[uriParts.length - 1].matches("[0-9]+")) return result;
+
+            if (uriParts.length > 1) {
+                // project_class and TLD normalization
+                result.setProjectClass(uriParts[uriParts.length - 2]);
+                result.setTld(uriParts[uriParts.length - 1]);
             }
+            // project normalization
+            if ((uriParts.length > 2) && (!uriParts[0].equals("www")))
+                result.setProject(uriParts[0]);
+            // qualifiers normalization: xx.[q1.q2.q3].wikixxx.xx
+            if (uriParts.length > 3) {
+                for (int i = 1; i < uriParts.length - 2; i++) {
+                    result.addQualifier(uriParts[i]);
+                }
+            }
+            normalizedHostCache.put(uriHost.toLowerCase(),result);
         }
-
         return result;
 
     }

-- 
To view, visit https://gerrit.wikimedia.org/r/356307
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic57d331ba445b37c4e5afc7903d16e607998284c
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Nuria <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to