Compare commits

..

2 Commits
master ... curl

Author SHA1 Message Date
Andros Fenollosa
8f09d77647 Add max time 2019-10-13 19:31:18 +02:00
Andros Fenollosa
de49bade47 Update request to curl 2019-10-13 19:01:22 +02:00
13 changed files with 28 additions and 2225058 deletions

3
.gitmodules vendored
View File

@ -1,3 +0,0 @@
[submodule "is-wordpress"]
path = is-wordpress
url = git@github.com:tanrax/is-wordpress.git

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

View File

@ -9,9 +9,11 @@ prepare: ## Download and format csv with domains
unzip top-1m.csv.zip unzip top-1m.csv.zip
rm top-1m.csv.zip rm top-1m.csv.zip
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
mkdir -p resources
mv top-1m-temp.csv resources/top-1m.csv mv top-1m-temp.csv resources/top-1m.csv
rm top-1m.csv rm top-1m.csv
run: ## Run checks run: ## Run checks
lein run lein run
calculate: ## Calculate the percentage
(awk 'END {print NR}' resources/top-1m.csv) / (cat resources/top-1m.csv | grep ',true' | wc -l)

View File

@ -1,17 +1,7 @@
# Calculate WordPress usage worldwide
The following script will analyze the list of the first million domains with the most visits to give you the percentage of use.
Be warned that it can take a long time: between 20 and 30 days.
## Requirements
To run it you'll need either 2 GB of RAM or a swap file of the same size.
## Install ## Install
``` sh ``` sh
sudo apt install clojure leiningen wget make curl bash unzip sudo apt install clojure leiningen wget make
``` ```
## Prepare ## Prepare
@ -23,31 +13,7 @@ make prepare
## Run ## Run
``` sh ``` sh
lein run make run
``` ```
When all the CSV sites are analyzed, you can see the final figure by running the following script OutOfMemoryError: Java heap space
## Calculate percentage
``` sh
bash calculate-percentage.sh
```
## Historical
### 2019
19%
### 2020
29%
## Article with conclusions (in Spanish)
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
## Hacker News (comments)
https://news.ycombinator.com/item?id=21428149

View File

@ -1,8 +0,0 @@
#!/bin/bash
# Print the percentage of analysed domains that were flagged as WordPress.
#
# Reads resources/top-1m.csv and counts the rows ending in ",true" and
# ",timeout". The result is
#   (true rows * 100) / (total rows - timeout rows)
# printed as a truncated integer followed by "%".
# Exits with status 1 (message on stderr) when the CSV cannot be read or
# when there are no non-timeout rows to divide by.
CSV="resources/top-1m.csv"

if [ ! -r "$CSV" ]; then
    echo "Cannot read $CSV" >&2
    exit 1
fi

# grep -c counts matching lines directly (no cat | grep | wc pipeline).
# Patterns are quoted so the shell never touches the `$` anchor.
NUM_WORDPRESS=$(grep -c ',true$' "$CSV")
NUM_TIMEOUT=$(grep -c ',timeout$' "$CSV")
TOTAL=$(wc -l < "$CSV")

# Rows that actually produced an answer; timeouts are excluded from the base.
CHECKED=$((TOTAL - NUM_TIMEOUT))

# Guard against division by zero (empty file or every row timed out).
if [ "$CHECKED" -le 0 ]; then
    echo "No analysed domains in $CSV" >&2
    exit 1
fi

# Integer division truncates, matching bc's default scale of 0.
echo "$(( (NUM_WORDPRESS * 100) / CHECKED ))%"

@ -1 +0,0 @@
Subproject commit b64770eb63dcb3924957bed824e79d380089e2c2

View File

@ -3,8 +3,7 @@
:url "http://example.com/FIXME" :url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"} :url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.10.0"] :dependencies [[org.clojure/clojure "1.10.1"]
[clj-http "3.10.0"]
[org.clojure/data.csv "0.1.4"]] [org.clojure/data.csv "0.1.4"]]
:jvm-opts ["-Xmx1G"] :jvm-opts ["-Xmx1G"]
:main ^:skip-aot wordpress-used.core :main ^:skip-aot wordpress-used.core

View File

@ -0,0 +1,4 @@
1,google.com,true
2,youtube.com,false
4,tmall.com,nil
5,idecrea.es,nil
1 1 google.com true
2 2 youtube.com false
3 4 tmall.com nil
4 5 idecrea.es nil

View File

@ -2,12 +2,13 @@
(:require (:require
[clojure.data.csv :as csv] [clojure.data.csv :as csv]
[clojure.java.io :as io] [clojure.java.io :as io]
[clojure.java.shell :as shell]) (:gen-class)) [clojure.java.shell :as shell]
) (:gen-class))
(defn wordpress? (defn request
"Check if a web page is generated with WordPress" "Make a request by means of curl"
[url] [url]
(= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true")) (shell/sh "curl" "-L" "--max-time" "5" "-H" "User-Agent: Firefox" url))
(defn read-csv-domains (defn read-csv-domains
"Read CSV file with all domains" "Read CSV file with all domains"
@ -15,38 +16,26 @@
(with-open [reader (io/reader (io/resource url))] (with-open [reader (io/reader (io/resource url))]
(doall (csv/read-csv reader)))) (doall (csv/read-csv reader))))
(defn get-chunk-list (defn wordpress?
"Cuts a list by the maximum number of fragments and returns the selected fragment." "Check if a web page is generated with WordPress"
[items-list chunk max-chunks] [url]
(let [list-size (count items-list) (let [response (request url)]
chunk-size (Math/ceil (/ list-size max-chunks)) (every? identity [(re-find (re-pattern "meta.*generator.*WordPress") (:out response))])))
chunk-start (int (* chunk chunk-size))
chunk-end (int (+ (* chunk chunk-size) chunk-size))]
(subvec items-list chunk-start (if (>= chunk-end list-size) list-size chunk-end))))
(defn analyse-list-chunk
"Analyse only the given list one chunck"
[items-list chunk max-chunks]
(doseq [domain-data (get-chunk-list items-list chunk max-chunks)] (let [line (get domain-data 0)
url (get domain-data 1)]
;; Show info
(prn (str line " " url))
;; Edit domains-csv with check WordPress
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv)))))
(defn -main (defn -main
[& args] [& args]
(let [;; Name of the file containing the CSV with the domains (let [;; Name of the file containing the CSV with the domains
file-csv (first args) file-csv "top-1m.csv"
;; Number of threads to be executed.
number-of-threads (second args)
;; Get domains from CSV ;; Get domains from CSV
domains-csv (vec (read-csv-domains file-csv)) domains-csv (vec (read-csv-domains file-csv))
;; Filters leaving those that have not been checked ;; Filters leaving those that have not been checked
domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))] domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
;; List with domains with a boolean indicating if it is generate or not in WordPress ;; List with domains with a boolean indicating if it is generate or not in WordPress
(prn "Start") (prn "Start")
(dotimes [i number-of-threads] (.start (Thread. (fn [] (doseq [domain-data domains-unchecked] (let [line (get domain-data 0)
(analyse-list-chunk domains-uncheked i number-of-threads))))) url (get domain-data 1)]
;; Show info
(prn (str line " " url))
;; Edit domains-csv with check WordPress
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv))))
(prn "Complete"))) (prn "Complete")))