Updated README

samdutton · Oct 3, 2023 · bf4655b · bf4655b
1 parent e9389b7
commit bf4655b
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 65 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,14 @@
+*~
+.bash_history
+.DS_Store
+.swp
+.vscode/
+client_secret.json
+default.profraw
+node_modules
+oauth2.keys.json
+keys.json
+credentials.json
+old
+tmp
+data
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # URL parts
 
-View the site, origin, and other parts of a URL.
+View the site, origin, and other parts of a URL: [url-parts.glitch.me](https://url-parts.glitch.me).
 
 
 # Comments and suggestions

diff --git a/js/main.js b/js/main.js
@@ -1,6 +1,8 @@
 /* Copyright 2023 Google LLC.
 SPDX-License-Identifier: Apache-2.0 */
 
+// Public Suffix List: https://publicsuffix.org/list/
+
 // Public Suffix List: https://publicsuffix.org/list/
 import pslEntries from './psl.js';
 // https://data.iana.org/TLD/tlds-alpha-by-domain.txt
@@ -23,7 +25,7 @@ if (urlParam) {
 handleUrl();
 
 function handleUrl() {
-  let urlText = urlInput.value;
+  const urlText = urlInput.value;
   // console.log('urlText:', urlText);
 
   // Begin by removing `?url= ...` search string.
@@ -59,9 +61,9 @@ function handleUrl() {
     // Hack to allow URLs without scheme.
     url = urlText.match(/^https?:\/\//) ? new URL(urlText) :
       new URL(`https://${urlText}`);
-  } catch {
-    console.log(`${urlText} is not a valid URL`);
-    urlPartsDiv.innerHTML = '';
+  } catch (error) {
+    console.log(`${urlText} is not a valid URL`, error);
+    urlPartsDiv.innerHTML = 'Not a valid URL.';
     return;
   }
 
@@ -117,72 +119,89 @@ function handleUrl() {
     return;
   }
 
-  let etldRegExp;
-  // Get the eTLD and eTLD+1.
-  const etld = pslEntries.find((el) => {
-    etldRegExp = new RegExp(`\\w+.${el}$`);
-    // if (el === 'co.uk') {
-    // }
-    return hostname.match(etldRegExp);
-  });
-  let etld1;
-  if (etld) {
-    const etld1RegExp = new RegExp(`[^\/\.]+\.${etld}`);
-    etld1 = urlText.match(etld1RegExp) && urlText.match(etld1RegExp)[0];
+  // Get the eTLD: this is the longest entry in the PSL that matches the end of the hostname.
+  // Note that the PSL includes single-part entries (com, au, etc.)
+  // as well as multi-part entries (currently up to five parts).
+  // All assigned TLDs in the Root Zone Database are in the PSL.
+
+  let etld = '';
+  for (const pslEntry of pslEntries) {
+    // Hostname is not valid if it matches a PSL entry.
+    if (hostname === pslEntry) {
+      urlPartsDiv.innerHTML = `Not a valid URL: hostname <span id="input-hostname">${hostname}</span> is an ` +
+        `eTLD (see the <a href="https://publicsuffix.org/">Public Suffix List</a>).`;
+      return;
+    }
+
+    // Check for match at end of hostname only.
+    // Need to add \\. so that an entry only matches after a dot: otherwise hostnames that merely end with the characters of a valid (e)TLD, such as 'web.xcom', would be accepted.
+    const pslEntryRegExp = new RegExp(`\\.${pslEntry.replaceAll('.', '\.')}$`);
+    // Find the longest eTLD in the PSL that matches the hostname (e.g. 'co.uk' rather than just 'co').
+    if (hostname.match(pslEntryRegExp) && pslEntry.length > etld.length) {
+      etld = pslEntry;
+    }
   }
 
+  console.log('etld', etld);
+
+  if (!etld) {
+    urlPartsDiv.innerHTML = `No eTLD from the <a href="https://publicsuffix.org/">Public Suffix List</a>` +
+      ` found in hostname <span id="input-hostname">${hostname}</span>.`;
+    return;
+  }
+
+  const etld1 = hostname.match(`[^\/\.]+\.${etld}`)[0];
+
+  if (!etld1) {
+    replace(`eTLD ${etld} specified, but no eTLD+1.`)
+  }
+
+
   // The spans need to wrap the URL from the outside in:
   // origin > originWithoutPort > hostname > site > eTLD+1 > eTLD > TLD.
 
   urlPartsDiv.innerHTML = urlText.
     replace(origin, `<span id="origin">${origin}</span>`);
 
-  // According to the URL standard, site must now include a scheme, so add a
-  // dotted border between the TLD+1 or eTLD+1 and the scheme.
+  // Although the URL standard now mandates that a site must include a scheme,
+  // span#site only wraps the eTLD+1.
+  // The scheme border is connected with the span#site border by a dotted border,
+  // by wrapping the whole origin (except the port) in span#site-dotted.
   if (scheme) {
     const siteDottedRegExp = new RegExp(`${scheme}.+${hostname}`);
     urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.
       replace(siteDottedRegExp, '<span id="site-dotted">$&</span>');
   }
 
-  // If the URL has a scheme, add a span to add a dashed border for site
-  // between the scheme and the rest of the site.
   urlPartsDiv.innerHTML =
     urlPartsDiv.innerHTML.replace(hostname, `<span id="hostname">${hostname}</span>`);
 
-  // Although the URL standard now mandates that a site must include a scheme,
-  // span#site only wraps the eTLD+1 or TLD+1.
-  // The scheme border is connected with the span#site border by a dotted border,
-  // by wrapping the whole origin (except the port) in span#site-dotted.
+  replace(etld1,
+    `<span id="etld1">${etld1}</span>`);
 
-  // If the URL uses an eTLD, add spans for eTLD+1 and eTLD.
-  if (etld) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(etld1,
-      `<span id="etld1">${etld1}</span>`);
-    // Site now requires scheme (according to the URL standard).
-    if (scheme) {
-      urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(`<span id="etld1">${etld1}</span>`,
-        `<span id="etld1"><span id="site">${etld1}</span></span>`);
-    }
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(etld,
-      `<span id="etld">${etld}</span>`);
-  // Site now requires scheme.
-  } else if (scheme) {
-    // Not eTLD, so site is TLD+1.
+  // Site now requires scheme according to the URL standard,
+  // so a dotted line is added between the scheme and the other parts of site (see above).
+  if (scheme) {
+    replace(`<span id="etld1">${etld1}</span>`,
+      `<span id="etld1"><span id="site">${etld1}</span></span>`);
     const site = hostname.split('.').slice(-2).join('.');
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(site, `<span id="site">${site}</span>`);
+    // replace(site, `<span id="site">${site}</span>`);
   }
 
+  replace(etld,
+    `<span id="etld">${etld}</span>`);
+
   // Wrap TLD in a span.
   // If the hostname includes an eTLD, urlPartsDiv.innerHTML will be wrapped in a span.
   // Otherwise, the whole hostname will be wrapped in a span.
   const tld = hostname.split('.').pop();
-  // Check if tld is in the list at js/tld.js from the Root Zone Database.
+
+  // Double check that tld is in the list at js/tld.js from the Root Zone Database.
+  // All TLDs should also be in the PSL (checked earlier) so at this point the tld should always be valid.
   if (tldEntries.includes(tld.toUpperCase())) {
-    const partBeforeTld = hostname.split('.').slice(-2, -1);
-    const tldRegExp = new RegExp(`${partBeforeTld}.(${tld})`);
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(tldRegExp,
-      partBeforeTld + '.<span id="tld">$1</span>');
+    // The TLD is the last part of span#etld
+    const tldRegExp = new RegExp(`(<span id="etld">[^<]*)(${tld})`);
+    replace(tldRegExp, '$1<span id="tld">$2</span>');
   } else {
     urlPartsDiv.innerHTML = 'TLD not found in the ' +
       '<a href="https://www.iana.org/domains/root/db">Root Zone Database</a>.';
@@ -192,50 +211,53 @@ function handleUrl() {
   // Hack: if the pathname is / then highlight the / after the origin
   // (not a / after the scheme).
   if (pathname === '/') {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(/\/$/,
+    replace(/\/$/,
       `<span id="pathname">/</span>`);
   } else if (pathname) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(pathname,
+    replace(pathname,
       `<span id="pathname">${pathname}</span>`);
   }
 
   if (filename) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(filename,
+    replace(filename,
       `<span id="filename">${filename}</span>`);
   }
   if (hash) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(hash,
+    replace(hash,
       `<span id="hash">${hash}</span>`);
   }
 
-  // TODO: surprisingly complex to get this to work with other URL parts!
-  // if (password) {
-  //   urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(`:${password}@`,
-  //     `:<span id="password">${password}</span>@`);
-  // }
+  // TODO: surprisingly complex to get this to work with other URL parts!
+  // if (password) {
+  //   replace(`:${password}@`,
+  //     `:<span id="password">${password}</span>@`);
+  // }
 
   if (port) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(`:${port}`,
+    replace(`:${port}`,
       `:<span id="port">${port}</span>`);
   }
   if (scheme) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(scheme,
+    replace(scheme,
       // `<span id="scheme">${scheme}</span>`);
       `<span id="scheme"><span id="origin-scheme"><span id="site-scheme">` +
           `${scheme}</span></span></span>`);
   }
   // If the URL has a hash value *and* a search string,
   // the URL API (for hash) returns the hash and the search string.
   if (search) {
-    urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(search,
+    replace(search,
       `<span id="search">${search}</span>`);
   }
+}
+
 
-  // TODO: surprisingly complex to get this to work with other URL parts!
-  // if (username) {
-  //   const usernameRegExp = new RegExp(`${username}([@:])`);
-  //   urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(usernameRegExp,
-  //     `<span id="username">${username}</span>$1`);
-  // }
-};
+// Utility functions
 
+function log(label) {
+  console.log(label, urlPartsDiv.innerHTML);
+}
+
+function replace(pattern, replacement) {
+  urlPartsDiv.innerHTML = urlPartsDiv.innerHTML.replace(pattern, replacement);
+}
diff --git a/js/psl.js b/js/psl.js