Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
8f09d77647 | ||
|
de49bade47 |
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +0,0 @@
|
||||
[submodule "is-wordpress"]
|
||||
path = is-wordpress
|
||||
url = git@github.com:tanrax/is-wordpress.git
|
898391
2019-top-1m.csv
898391
2019-top-1m.csv
File diff suppressed because it is too large
Load Diff
888703
2020-top-1m.csv
888703
2020-top-1m.csv
File diff suppressed because it is too large
Load Diff
437884
2021-top-1m.csv
437884
2021-top-1m.csv
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
4
Makefile
4
Makefile
@ -9,9 +9,11 @@ prepare: ## Download and format csv with domains
|
||||
unzip top-1m.csv.zip
|
||||
rm top-1m.csv.zip
|
||||
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
|
||||
mkdir -p resources
|
||||
mv top-1m-temp.csv resources/top-1m.csv
|
||||
rm top-1m.csv
|
||||
|
||||
run: ## Run checks
|
||||
lein run
|
||||
|
||||
calculate: ## Calculate the percentage
|
||||
(awk 'END {print NR}' resources/top-1m.csv) / (cat resources/top-1m.csv | grep ',true' | wc -l)
|
||||
|
40
README.md
40
README.md
@ -1,17 +1,7 @@
|
||||
# Calculate WordPress usage worldwide
|
||||
|
||||
The following script analyzes the list of the one million most-visited domains and gives you the percentage of them that use WordPress.
|
||||
|
||||
Warning: it can take a long time, between 20 and 30 days.
|
||||
|
||||
## Requirements
|
||||
|
||||
To run it you'll need either 2 GB of RAM or a swap file of the same size.
|
||||
|
||||
## Install
|
||||
|
||||
``` sh
|
||||
sudo apt install clojure leiningen wget make curl bash unzip
|
||||
sudo apt install clojure leiningen wget make
|
||||
```
|
||||
|
||||
## Prepare
|
||||
@ -23,31 +13,7 @@ make prepare
|
||||
## Run
|
||||
|
||||
``` sh
|
||||
lein run
|
||||
make run
|
||||
```
|
||||
|
||||
When all the sites in the CSV have been analyzed, you can see the final figure by running the following script:
|
||||
|
||||
## Calculate percentage
|
||||
|
||||
``` sh
|
||||
bash calculate-percentage.sh
|
||||
```
|
||||
|
||||
## Historical
|
||||
|
||||
### 2019
|
||||
|
||||
19%
|
||||
|
||||
### 2020
|
||||
|
||||
29%
|
||||
|
||||
## Article with conclusions (in Spanish)
|
||||
|
||||
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
|
||||
|
||||
## Hacker News (comments)
|
||||
|
||||
https://news.ycombinator.com/item?id=21428149
|
||||
OutOfMemoryError: Java heap space
|
||||
|
@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
CSV="resources/top-1m.csv"
|
||||
NUM_WORDPRESS=$(cat $CSV | grep ,true$ | wc -l)
|
||||
NUM_NOT_WORDPRESS=$(cat $CSV | grep ,false$ | wc -l)
|
||||
NUM_TIMEOUT=$(cat $CSV | grep ,timeout$ | wc -l)
|
||||
TOTAL=$(cat $CSV | wc -l)
|
||||
|
||||
echo "($NUM_WORDPRESS * 100) / ($TOTAL - $NUM_TIMEOUT)" | bc | awk '{print $1"%"}'
|
@ -1 +0,0 @@
|
||||
Subproject commit b64770eb63dcb3924957bed824e79d380089e2c2
|
@ -3,8 +3,7 @@
|
||||
:url "http://example.com/FIXME"
|
||||
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
|
||||
:url "https://www.eclipse.org/legal/epl-2.0/"}
|
||||
:dependencies [[org.clojure/clojure "1.10.0"]
|
||||
[clj-http "3.10.0"]
|
||||
:dependencies [[org.clojure/clojure "1.10.1"]
|
||||
[org.clojure/data.csv "0.1.4"]]
|
||||
:jvm-opts ["-Xmx1G"]
|
||||
:main ^:skip-aot wordpress-used.core
|
||||
|
4
resources/top-1m-test.csv
Normal file
4
resources/top-1m-test.csv
Normal file
@ -0,0 +1,4 @@
|
||||
1,google.com,true
|
||||
2,youtube.com,false
|
||||
4,tmall.com,nil
|
||||
5,idecrea.es,nil
|
|
@ -2,12 +2,13 @@
|
||||
(:require
|
||||
[clojure.data.csv :as csv]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.java.shell :as shell]) (:gen-class))
|
||||
[clojure.java.shell :as shell]
|
||||
) (:gen-class))
|
||||
|
||||
(defn wordpress?
|
||||
"Check if a web page is generated with WordPress"
|
||||
(defn request
|
||||
"Make a request by means of curl"
|
||||
[url]
|
||||
(= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true"))
|
||||
(shell/sh "curl" "-L" "--max-time" "5" "-H" "User-Agent: Firefox" url))
|
||||
|
||||
(defn read-csv-domains
|
||||
"Read CSV file with all domains"
|
||||
@ -15,38 +16,26 @@
|
||||
(with-open [reader (io/reader (io/resource url))]
|
||||
(doall (csv/read-csv reader))))
|
||||
|
||||
(defn get-chunk-list
|
||||
"Cuts a list by the maximum number of fragments and returns the selected fragment."
|
||||
[items-list chunk max-chunks]
|
||||
(let [list-size (count items-list)
|
||||
chunk-size (Math/ceil (/ list-size max-chunks))
|
||||
chunk-start (int (* chunk chunk-size))
|
||||
chunk-end (int (+ (* chunk chunk-size) chunk-size))]
|
||||
(subvec items-list chunk-start (if (>= chunk-end list-size) list-size chunk-end))))
|
||||
|
||||
(defn analyse-list-chunk
|
||||
"Analyse only the given list one chunck"
|
||||
[items-list chunk max-chunks]
|
||||
(doseq [domain-data (get-chunk-list items-list chunk max-chunks)] (let [line (get domain-data 0)
|
||||
url (get domain-data 1)]
|
||||
;; Show info
|
||||
(prn (str line " " url))
|
||||
;; Edit domains-csv with check WordPress
|
||||
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv)))))
|
||||
(defn wordpress?
|
||||
"Check if a web page is generated with WordPress"
|
||||
[url]
|
||||
(let [response (request url)]
|
||||
(every? identity [(re-find (re-pattern "meta.*generator.*WordPress") (:out response))])))
|
||||
|
||||
(defn -main
|
||||
[& args]
|
||||
(let [;; Name of the file containing the CSV with the domains
|
||||
file-csv (first args)
|
||||
;; Number of threads to be executed.
|
||||
number-of-threads (second args)
|
||||
file-csv "top-1m.csv"
|
||||
;; Get domains from CSV
|
||||
domains-csv (vec (read-csv-domains file-csv))
|
||||
;; Filters leaving those that have not been checked
|
||||
domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
|
||||
;; List of domains, each with a boolean indicating whether it is generated with WordPress
|
||||
(prn "Start")
|
||||
(dotimes [i number-of-threads] (.start (Thread. (fn []
|
||||
(analyse-list-chunk domains-uncheked i number-of-threads)))))
|
||||
|
||||
(doseq [domain-data domains-unchecked] (let [line (get domain-data 0)
|
||||
url (get domain-data 1)]
|
||||
;; Show info
|
||||
(prn (str line " " url))
|
||||
;; Edit domains-csv with check WordPress
|
||||
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv))))
|
||||
(prn "Complete")))
|
||||
|
Loading…
Reference in New Issue
Block a user