Compare commits
23 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
215b736472 | ||
|
fa9fa1dca6 | ||
|
3410be52fa | ||
|
cfb7f1dcb9 | ||
|
ffa15e4178 | ||
|
9de202f90c | ||
|
8f7a752e47 | ||
|
5e40532a5f | ||
|
533100b43c | ||
|
e6eef9b5e9 | ||
|
91584d565f | ||
|
0329d875e8 | ||
|
af0a771645 | ||
|
d42965972c | ||
|
5aa344aa2d | ||
|
34a380dd19 | ||
|
f15acb849b | ||
|
d9de62f265 | ||
|
9572c4bdcd | ||
|
1cb66fb794 | ||
|
63f74932c5 | ||
|
5cff50d1d9 | ||
|
30e0110cb7 |
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "is-wordpress"]
|
||||||
|
path = is-wordpress
|
||||||
|
url = git@github.com:tanrax/is-wordpress.git
|
898391
2019-top-1m.csv
Normal file
898391
2019-top-1m.csv
Normal file
File diff suppressed because it is too large
Load Diff
888703
2020-top-1m.csv
Normal file
888703
2020-top-1m.csv
Normal file
File diff suppressed because it is too large
Load Diff
437884
2021-top-1m.csv
Normal file
437884
2021-top-1m.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
2021-tranco-5.5m.sqlite.tar.bz2
Normal file
BIN
2021-tranco-5.5m.sqlite.tar.bz2
Normal file
Binary file not shown.
BIN
2021-tranco-5.5m.tar.bz2
Normal file
BIN
2021-tranco-5.5m.tar.bz2
Normal file
Binary file not shown.
1
Makefile
1
Makefile
@ -9,6 +9,7 @@ prepare: ## Download and format csv with domains
|
|||||||
unzip top-1m.csv.zip
|
unzip top-1m.csv.zip
|
||||||
rm top-1m.csv.zip
|
rm top-1m.csv.zip
|
||||||
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
|
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
|
||||||
|
mkdir -p resources
|
||||||
mv top-1m-temp.csv resources/top-1m.csv
|
mv top-1m-temp.csv resources/top-1m.csv
|
||||||
rm top-1m.csv
|
rm top-1m.csv
|
||||||
|
|
||||||
|
40
README.md
40
README.md
@ -1,7 +1,17 @@
|
|||||||
|
# Calculate WordPress usage worldwide
|
||||||
|
|
||||||
|
The following script analyzes the list of the one million most-visited domains and reports the percentage of them that use WordPress.
|
||||||
|
|
||||||
|
Warning: it can take a long time to run, between 20 and 30 days.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
To run it you'll need either 2 GB of RAM or a swap file of the same size.
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
``` sh
|
``` sh
|
||||||
sudo apt install clojure leiningen wget make
|
sudo apt install clojure leiningen wget make curl bash unzip
|
||||||
```
|
```
|
||||||
|
|
||||||
## Prepare
|
## Prepare
|
||||||
@ -13,7 +23,31 @@ make prepare
|
|||||||
## Run
|
## Run
|
||||||
|
|
||||||
``` sh
|
``` sh
|
||||||
make run
|
lein run
|
||||||
```
|
```
|
||||||
|
|
||||||
OutOfMemoryError: Java heap space
|
Once all the sites in the CSV have been analyzed, you can see the final figure by running the following script:
|
||||||
|
|
||||||
|
## Calculate percentage
|
||||||
|
|
||||||
|
``` sh
|
||||||
|
bash calculate-percentage.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Historical
|
||||||
|
|
||||||
|
### 2019
|
||||||
|
|
||||||
|
19%
|
||||||
|
|
||||||
|
### 2020
|
||||||
|
|
||||||
|
29%
|
||||||
|
|
||||||
|
## Article with conclusions (in Spanish)
|
||||||
|
|
||||||
|
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
|
||||||
|
|
||||||
|
## Hacker News (comments)
|
||||||
|
|
||||||
|
https://news.ycombinator.com/item?id=21428149
|
||||||
|
8
calculate-percentage.sh
Normal file
8
calculate-percentage.sh
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
CSV="resources/top-1m.csv"
|
||||||
|
NUM_WORDPRESS=$(cat $CSV | grep ,true$ | wc -l)
|
||||||
|
NUM_NOT_WORDPRESS=$(cat $CSV | grep ,false$ | wc -l)
|
||||||
|
NUM_TIMEOUT=$(cat $CSV | grep ,timeout$ | wc -l)
|
||||||
|
TOTAL=$(cat $CSV | wc -l)
|
||||||
|
|
||||||
|
echo "($NUM_WORDPRESS * 100) / ($TOTAL - $NUM_TIMEOUT)" | bc | awk '{print $1"%"}'
|
1
is-wordpress
Submodule
1
is-wordpress
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit b64770eb63dcb3924957bed824e79d380089e2c2
|
@ -1,4 +0,0 @@
|
|||||||
1,google.com,true
|
|
||||||
2,youtube.com,false
|
|
||||||
4,tmall.com,nil
|
|
||||||
5,idecrea.es,nil
|
|
|
@ -1,26 +1,13 @@
|
|||||||
(ns wordpress-used.core
|
(ns wordpress-used.core
|
||||||
(:require
|
(:require
|
||||||
[clj-http.client :as client]
|
|
||||||
[clojure.data.csv :as csv]
|
[clojure.data.csv :as csv]
|
||||||
[clojure.java.io :as io]
|
[clojure.java.io :as io]
|
||||||
[clojure.java.shell :as shell]
|
[clojure.java.shell :as shell]) (:gen-class))
|
||||||
) (:gen-class))
|
|
||||||
|
|
||||||
(def headers {"User-Agent" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"
|
(defn wordpress?
|
||||||
"Accept" "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
|
"Check if a web page is generated with WordPress"
|
||||||
"Accept-Language" "es,en-US;q=0.7,en;q=0.3"
|
[url]
|
||||||
"Accept-Encoding" "gzip, deflate, br"
|
(= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true"))
|
||||||
"DNT" "1"
|
|
||||||
"Connection" "keep-alive"
|
|
||||||
"Upgrade-Insecure-Requests" "1"
|
|
||||||
"Pragma" "no-cache"
|
|
||||||
"Cache-Control" "no-cache"
|
|
||||||
"TE" "Trailers"})
|
|
||||||
(def http-config
|
|
||||||
{:headers headers
|
|
||||||
:ignore-unknown-host? true
|
|
||||||
:connection-timeout 5000
|
|
||||||
:throw-exceptions false})
|
|
||||||
|
|
||||||
(defn read-csv-domains
|
(defn read-csv-domains
|
||||||
"Read CSV file with all domains"
|
"Read CSV file with all domains"
|
||||||
@ -28,29 +15,38 @@
|
|||||||
(with-open [reader (io/reader (io/resource url))]
|
(with-open [reader (io/reader (io/resource url))]
|
||||||
(doall (csv/read-csv reader))))
|
(doall (csv/read-csv reader))))
|
||||||
|
|
||||||
(defn wordpress?
|
(defn get-chunk-list
|
||||||
"Check if a web page is generated with WordPress"
|
"Cuts a list by the maximum number of fragments and returns the selected fragment."
|
||||||
[url]
|
[items-list chunk max-chunks]
|
||||||
(try
|
(let [list-size (count items-list)
|
||||||
(let [response (client/get (str "http://" url "/") http-config)]
|
chunk-size (Math/ceil (/ list-size max-chunks))
|
||||||
(every? identity [(re-find (re-pattern "meta.*generator.*WordPress") (:body response))]))
|
chunk-start (int (* chunk chunk-size))
|
||||||
(catch Exception e
|
chunk-end (int (+ (* chunk chunk-size) chunk-size))]
|
||||||
"timeout")))
|
(subvec items-list chunk-start (if (>= chunk-end list-size) list-size chunk-end))))
|
||||||
|
|
||||||
|
(defn analyse-list-chunk
|
||||||
|
"Analyse only the given list one chunck"
|
||||||
|
[items-list chunk max-chunks]
|
||||||
|
(doseq [domain-data (get-chunk-list items-list chunk max-chunks)] (let [line (get domain-data 0)
|
||||||
|
url (get domain-data 1)]
|
||||||
|
;; Show info
|
||||||
|
(prn (str line " " url))
|
||||||
|
;; Edit domains-csv with check WordPress
|
||||||
|
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv)))))
|
||||||
|
|
||||||
(defn -main
|
(defn -main
|
||||||
[& args]
|
[& args]
|
||||||
(let [;; Name of the file containing the CSV with the domains
|
(let [;; Name of the file containing the CSV with the domains
|
||||||
file-csv "top-1m.csv"
|
file-csv (first args)
|
||||||
|
;; Number of threads to be executed.
|
||||||
|
number-of-threads (second args)
|
||||||
;; Get domains from CSV
|
;; Get domains from CSV
|
||||||
domains-csv (vec (read-csv-domains file-csv))
|
domains-csv (vec (read-csv-domains file-csv))
|
||||||
;; Filters leaving those that have not been checked
|
;; Filters leaving those that have not been checked
|
||||||
domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
|
domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
|
||||||
;; List with domains with a boolean indicating if it is generate or not in WordPress
|
;; List with domains with a boolean indicating if it is generate or not in WordPress
|
||||||
(prn "Start")
|
(prn "Start")
|
||||||
(doseq [domain-data domains-unchecked] (let [line (get domain-data 0)
|
(dotimes [i number-of-threads] (.start (Thread. (fn []
|
||||||
url (get domain-data 1)]
|
(analyse-list-chunk domains-uncheked i number-of-threads)))))
|
||||||
;; Show info
|
|
||||||
(prn (str line " " url))
|
|
||||||
;; Edit domains-csv with check WordPress
|
|
||||||
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv))))
|
|
||||||
(prn "Complete")))
|
(prn "Complete")))
|
||||||
|
Loading…
Reference in New Issue
Block a user