Compare commits

..

2 Commits
master ... curl

Author SHA1 Message Date
Andros Fenollosa
8f09d77647 Add max time 2019-10-13 19:31:18 +02:00
Andros Fenollosa
de49bade47 Update request to curl 2019-10-13 19:01:22 +02:00
13 changed files with 28 additions and 2225058 deletions

3
.gitmodules vendored
View File

@ -1,3 +0,0 @@
[submodule "is-wordpress"]
path = is-wordpress
url = git@github.com:tanrax/is-wordpress.git

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

View File

@ -9,9 +9,11 @@ prepare: ## Download and format csv with domains
unzip top-1m.csv.zip
rm top-1m.csv.zip
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
mkdir -p resources
mv top-1m-temp.csv resources/top-1m.csv
rm top-1m.csv
# Start the checker by invoking Leiningen's `run` task.
run: ## Run checks
lein run
# Print the percentage of rows in resources/top-1m.csv whose third CSV field
# is "true". Every row counts toward the total, including rows that have not
# been checked yet. An empty file prints 0% instead of dividing by zero.
# `$$` is Make's escape for a literal `$` passed to the shell.
calculate: ## Calculate the percentage
	awk -F, '$$3 == "true" {wp++} END {if (NR > 0) printf "%.2f%%\n", wp * 100 / NR; else print "0%"}' resources/top-1m.csv

View File

@ -1,17 +1,7 @@
# Calculate WordPress usage worldwide
The following script analyzes the list of the one million most-visited domains and reports the percentage of them that use WordPress.
Warning: it can take a long time, between 20 and 30 days.
## Requirements
To run it you'll need either 2 GB of RAM or a swap file of the same size.
## Install
``` sh
sudo apt install clojure leiningen wget make curl bash unzip
sudo apt install clojure leiningen wget make
```
## Prepare
@ -23,31 +13,7 @@ make prepare
## Run
``` sh
lein run
make run
```
When all the CSV sites are analyzed, you can see the final figure by running the following script
## Calculate percentage
``` sh
bash calculate-percentage.sh
```
## Historical
### 2019
19%
### 2020
29%
## Article with conclusions (in Spanish)
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
## Hacker News (comments)
https://news.ycombinator.com/item?id=21428149
OutOfMemoryError: Java heap space

View File

@ -1,8 +0,0 @@
#!/bin/bash
# Print the share of analysed domains that use WordPress as an integer
# percentage (e.g. "19%").
#
# Input: resources/top-1m.csv, where the last field of each row is the check
# result. Rows ending in ",timeout" are excluded from the denominator; every
# other row (including rows not yet checked) counts toward the total.
# Exit status: 1 when the CSV cannot be read or no row is left to divide by.

CSV="resources/top-1m.csv"

if [[ ! -r "$CSV" ]]; then
    echo "Cannot read $CSV" >&2
    exit 1
fi

# grep -c prints the number of matching lines (0 when nothing matches).
NUM_WORDPRESS=$(grep -c ',true$' "$CSV")
NUM_TIMEOUT=$(grep -c ',timeout$' "$CSV")
TOTAL=$(wc -l < "$CSV")

ANALYSED=$((TOTAL - NUM_TIMEOUT))
if (( ANALYSED <= 0 )); then
    # Without this guard the division below would fail and print nothing.
    echo "No analysed domains in $CSV" >&2
    exit 1
fi

# Integer division, truncating like bc with its default scale of 0.
echo "$(( (NUM_WORDPRESS * 100) / ANALYSED ))%"

@ -1 +0,0 @@
Subproject commit b64770eb63dcb3924957bed824e79d380089e2c2

View File

@ -3,8 +3,7 @@
:url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.10.0"]
[clj-http "3.10.0"]
:dependencies [[org.clojure/clojure "1.10.1"]
[org.clojure/data.csv "0.1.4"]]
:jvm-opts ["-Xmx1G"]
:main ^:skip-aot wordpress-used.core

View File

@ -0,0 +1,4 @@
1,google.com,true
2,youtube.com,false
4,tmall.com,nil
5,idecrea.es,nil
1 1 google.com true
2 2 youtube.com false
3 4 tmall.com nil
4 5 idecrea.es nil

View File

@ -2,12 +2,13 @@
(:require
[clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.java.shell :as shell]) (:gen-class))
[clojure.java.shell :as shell]
) (:gen-class))
(defn wordpress?
"Check if a web page is generated with WordPress"
(defn request
"Make a request by means of curl"
[url]
(= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true"))
(shell/sh "curl" "-L" "--max-time" "5" "-H" "User-Agent: Firefox" url))
(defn read-csv-domains
"Read CSV file with all domains"
@ -15,38 +16,26 @@
(with-open [reader (io/reader (io/resource url))]
(doall (csv/read-csv reader))))
(defn get-chunk-list
  "Split `items-list` (a vector) into `max-chunks` consecutive fragments of
  ceil(count / max-chunks) items each and return fragment number `chunk`
  (zero-based) as a subvector.

  The last non-empty fragment may be shorter than the others. When the
  rounded-up fragment size means fewer than `max-chunks` fragments are needed,
  the trailing fragments are returned as empty vectors instead of throwing."
  [items-list chunk max-chunks]
  (let [list-size   (count items-list)
        chunk-size  (long (Math/ceil (/ list-size max-chunks)))
        ;; Clamp both bounds to the vector length: e.g. 5 items in 4 chunks
        ;; gives size 2, so chunk 3 would otherwise start at index 6.
        chunk-start (min list-size (* chunk chunk-size))
        chunk-end   (min list-size (+ chunk-start chunk-size))]
    (subvec items-list chunk-start chunk-end)))
(defn analyse-list-chunk
  "Analyse only one chunk of the given list.

  Takes fragment number `chunk` (out of `max-chunks`) of `items-list` via
  `get-chunk-list`. For every row in it, prints the row's first two fields,
  checks the second field with `wordpress?`, and rewrites the CSV under
  resources/ in place with sed, replacing that row's trailing `,nil` marker
  with the result."
  [items-list chunk max-chunks]
  ;; NOTE(review): `file-csv` is not bound inside this function; confirm it is
  ;; defined at namespace level, otherwise this form will not compile.
  (doseq [domain-data (get-chunk-list items-list chunk max-chunks)]
    (let [line (get domain-data 0)
          url  (get domain-data 1)]
      ;; Show info
      (prn (str line " " url))
      ;; Edit domains-csv with check WordPress.
      ;; The first CSV field is used as the sed line address. The pattern is
      ;; anchored to the trailing ",nil" so that a domain which itself contains
      ;; "nil" is left untouched.
      (shell/sh "sed" "-i"
                (str line "s/,nil$/," (wordpress? url) "/")
                (str "resources/" file-csv)))))
(defn wordpress?
  "Fetch `url` with `request` and return true when the response output
  matches the WordPress generator meta pattern, false otherwise."
  [url]
  (let [page-body     (:out (request url))
        generator-tag (re-pattern "meta.*generator.*WordPress")]
    ;; re-find yields the matched text or nil; coerce that to a strict boolean.
    (boolean (re-find generator-tag page-body))))
;; Entry point: reads the domains CSV, keeps the rows whose third field is
;; still "nil", checks each one with `wordpress?` and writes the result back
;; into the CSV under resources/ through `sed -i`.
(defn -main
[& args]
(let [;; Name of the file containing the CSV with the domains
;; NOTE(review): file-csv is bound twice in this let; the later "top-1m.csv"
;; binding shadows this one, so the first argument is effectively ignored.
file-csv (first args)
;; Number of threads to be executed.
;; NOTE(review): taken from args without numeric conversion, while dotimes
;; below needs a number - confirm what type the caller passes.
number-of-threads (second args)
file-csv "top-1m.csv"
;; Get domains from CSV
domains-csv (vec (read-csv-domains file-csv))
;; Filters leaving those that have not been checked
domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
;; List with domains with a boolean indicating if it is generate or not in WordPress
(prn "Start")
;; Starts one thread per chunk index; the threads are not joined here.
;; NOTE(review): `domains-uncheked` is spelled differently from the
;; `domains-unchecked` binding above - looks like a typo, confirm.
(dotimes [i number-of-threads] (.start (Thread. (fn []
(analyse-list-chunk domains-uncheked i number-of-threads)))))
;; NOTE(review): this sequential loop walks the same unchecked rows as the
;; threads started above - confirm which of the two is meant to remain.
(doseq [domain-data domains-unchecked] (let [line (get domain-data 0)
url (get domain-data 1)]
;; Show info
(prn (str line " " url))
;; Edit domains-csv with check WordPress
;; The first CSV field is used as the sed line address.
;; NOTE(review): s/nil/.../g replaces every "nil" on that line, including
;; one that appears inside the domain name.
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv))))
(prn "Complete")))