Compare commits

23 Commits
curl ... master

Author SHA1 Message Date
215b736472 Add SQLite version 2021 2021-09-28 12:34:15 +02:00
fa9fa1dca6 Create 2021-top-1m.csv 2021-09-27 18:53:02 +02:00
3410be52fa Compress data 2021-09-27 18:53:01 +02:00
cfb7f1dcb9 Add cores 2021-09-27 18:49:54 +02:00
ffa15e4178 Update README.md 2020-01-09 00:03:16 +01:00
9de202f90c Rename 2020-01-09 00:02:12 +01:00
8f7a752e47 Add files via upload 2020-01-08 23:59:44 +01:00
5e40532a5f Update README.md 2019-12-04 19:19:00 +01:00
533100b43c Update README.md 2019-12-04 19:15:27 +01:00
e6eef9b5e9 Add repo is-wordpress 2019-12-04 19:10:34 +01:00
91584d565f Delete is-wordpress.sh 2019-12-01 11:02:22 +01:00
0329d875e8 Create is-wordpress.sh 2019-12-01 10:32:39 +01:00
af0a771645 Update README.md 2019-11-03 11:06:13 +01:00
d42965972c Update Makefile 2019-11-03 10:40:35 +01:00
5aa344aa2d Update README.md 2019-11-02 16:15:44 +01:00
34a380dd19 Add files via upload 2019-10-31 18:22:02 +01:00
f15acb849b Delete top-1m-test.csv 2019-10-31 18:00:50 +01:00
d9de62f265 Update README.md 2019-10-31 17:56:09 +01:00
9572c4bdcd Rename calcular-porcentaje.sh to calculate-percentage.sh 2019-10-31 17:55:51 +01:00
1cb66fb794 Update README.md 2019-10-31 17:54:27 +01:00
63f74932c5 Script calculate 2019-10-31 17:44:48 +01:00
5cff50d1d9 Update core.clj 2019-10-23 19:58:17 +02:00
30e0110cb7 Update README.md 2019-10-23 19:50:50 +02:00
12 changed files with 2225057 additions and 40 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "is-wordpress"]
path = is-wordpress
url = git@github.com:tanrax/is-wordpress.git

898391
2019-top-1m.csv Normal file

File diff suppressed because it is too large Load Diff

888703
2020-top-1m.csv Normal file

File diff suppressed because it is too large Load Diff

437884
2021-top-1m.csv Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

BIN
2021-tranco-5.5m.tar.bz2 Normal file

Binary file not shown.

View File

@ -9,6 +9,7 @@ prepare: ## Download and format csv with domains
unzip top-1m.csv.zip
rm top-1m.csv.zip
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
mkdir -p resources
mv top-1m-temp.csv resources/top-1m.csv
rm top-1m.csv

View File

@ -1,7 +1,17 @@
# Calculate WordPress usage worldwide
The following script will analyze the list of the first million domains with the most visits to give you the percentage of use.
Warning: it can take a long time, between 20 and 30 days.
## Requirements
To run it you'll need either 2 GB of RAM or a swap file of the same size.
## Install
``` sh
sudo apt install clojure leiningen wget make
sudo apt install clojure leiningen wget make curl bash unzip
```
## Prepare
@ -13,7 +23,31 @@ make prepare
## Run
``` sh
make run
lein run
```
OutOfMemoryError: Java heap space
When all the sites in the CSV have been analyzed, you can see the final figure by running the following script:
## Calculate percentage
``` sh
bash calculate-percentage.sh
```
## Historical
### 2019
19%
### 2020
29%
## Article with conclusions (in Spanish)
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
## Hacker News (comments)
https://news.ycombinator.com/item?id=21428149

8
calculate-percentage.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Prints the percentage of domains detected as WordPress, as a whole number
# followed by "%".
#
# Usage: bash calculate-percentage.sh [CSV]
#   CSV  path to the analysed CSV; defaults to resources/top-1m.csv.
#
# Each CSV line is expected to end in ",true", ",false", ",timeout" or ",nil".
# Lines ending in ",timeout" are excluded from the denominator; every other
# line (including ones still marked ",nil") counts towards it.

CSV="${1:-resources/top-1m.csv}"

if [ ! -r "$CSV" ]; then
    echo "Cannot read CSV file: $CSV" >&2
    exit 1
fi

# grep -c prints the number of matching lines (0 when there are none).
NUM_WORDPRESS=$(grep -c ',true$' "$CSV")
NUM_TIMEOUT=$(grep -c ',timeout$' "$CSV")
TOTAL=$(wc -l < "$CSV")

ANSWERED=$((TOTAL - NUM_TIMEOUT))

# Without this guard an empty file, or one where every domain timed out,
# would end in a division by zero.
if [ "$ANSWERED" -le 0 ]; then
    echo "No analysed domains in $CSV: cannot calculate a percentage" >&2
    exit 1
fi

# Integer division, truncated like the default scale of bc.
echo "$((NUM_WORDPRESS * 100 / ANSWERED))%"

1
is-wordpress Submodule

Submodule is-wordpress added at b64770eb63

View File

@ -1,4 +0,0 @@
1,google.com,true
2,youtube.com,false
4,tmall.com,nil
5,idecrea.es,nil
1 1 google.com true
2 2 youtube.com false
3 4 tmall.com nil
4 5 idecrea.es nil

View File

@ -1,26 +1,13 @@
(ns wordpress-used.core
(:require
[clj-http.client :as client]
[clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.java.shell :as shell]
) (:gen-class))
[clojure.java.shell :as shell]) (:gen-class))
;; HTTP request headers, used as the :headers entry of `http-config`.
;; The values present the client as a desktop Firefox 69 browser on Ubuntu
;; (see "User-Agent"), ask for HTML first, list Spanish ahead of English in
;; "Accept-Language", and request uncached responses through both "Pragma"
;; and "Cache-Control".
(def headers {"User-Agent" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"
"Accept" "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
"Accept-Language" "es,en-US;q=0.7,en;q=0.3"
"Accept-Encoding" "gzip, deflate, br"
"DNT" "1"
"Connection" "keep-alive"
"Upgrade-Insecure-Requests" "1"
"Pragma" "no-cache"
"Cache-Control" "no-cache"
"TE" "Trailers"})
;; Options map passed as the second argument of `client/get` by the
;; HTTP-based `wordpress?`.
;; NOTE(review): the option meanings below are inferred from the key names;
;; confirm them against the clj-http version the project uses.
;;   :ignore-unknown-host?  presumably avoids an exception for unresolvable hosts
;;   :connection-timeout    presumably milliseconds (5000 = 5 s)
;;   :throw-exceptions      presumably stops error status codes from throwing
(def http-config
{:headers headers
:ignore-unknown-host? true
:connection-timeout 5000
:throw-exceptions false})
(defn wordpress?
  "Runs the `./is-wordpress/is-wordpress` script with bash, passing `url` as its
  only argument, and returns true only when the script's standard output,
  with trailing newline characters removed, is exactly \"true\". Any other
  output (including none) yields false."
  [url]
  (let [result (shell/sh "bash" "./is-wordpress/is-wordpress" url)
        answer (clojure.string/trim-newline (:out result))]
    (= answer "true")))
(defn read-csv-domains
"Read CSV file with all domains"
@ -28,29 +15,38 @@
(with-open [reader (io/reader (io/resource url))]
(doall (csv/read-csv reader))))
(defn wordpress?
  "Fetches http://<url>/ with `client/get` and `http-config` and looks in the
  response body for a generator meta tag that mentions WordPress.

  Returns true when the pattern is found and false when it is not. Any
  exception raised while fetching or matching, not only a timeout, is caught
  and reported as the string \"timeout\"."
  [url]
  (try
    (let [page-url (str "http://" url "/")
          body     (:body (client/get page-url http-config))
          marker   (re-find (re-pattern "meta.*generator.*WordPress") body)]
      (if marker true false))
    (catch Exception _
      "timeout")))
(defn get-chunk-list
  "Returns the `chunk`-th (zero-based) slice of the vector `items-list` when it
  is cut into at most `max-chunks` consecutive slices.

  Every slice holds ceil(count / max-chunks) items, except that the last
  non-empty one may be shorter. Because the slice size is rounded up, trailing
  chunk indexes can start past the end of the vector (5 items in 4 chunks gives
  slices of 2, so chunk 3 would start at index 6). Both bounds are clamped to
  the vector size, so such chunks return an empty vector instead of making
  `subvec` throw IndexOutOfBoundsException.

  `items-list` must be a vector and `max-chunks` a positive integer."
  [items-list chunk max-chunks]
  (let [list-size   (count items-list)
        chunk-size  (long (Math/ceil (double (/ list-size max-chunks))))
        chunk-start (min list-size (* chunk chunk-size))
        chunk-end   (min list-size (+ chunk-start chunk-size))]
    (subvec items-list chunk-start chunk-end)))
;; Guards the in-place `sed -i` edits of the CSV: several worker threads edit
;; the same file, and concurrent in-place rewrites could overwrite each other.
(def ^:private csv-write-lock (Object.))

(defn analyse-list-chunk
  "Checks every domain in one chunk of `items-list` and records each result in
  the CSV file `resources/<file-csv>`.

  items-list  vector of CSV rows; column 0 is used as the sed line address and
              column 1 as the domain to check
  chunk       zero-based index of the chunk to process
  max-chunks  number of chunks the list is divided into
  file-csv    CSV file name inside `resources/` (defaults to \"top-1m.csv\")

  For each row the trailing `,nil` of the addressed line is replaced with the
  value returned by `wordpress?`. The substitution is anchored to the end of
  the line so a domain that merely contains \"nil\" is not altered."
  ([items-list chunk max-chunks]
   (analyse-list-chunk items-list chunk max-chunks "top-1m.csv"))
  ([items-list chunk max-chunks file-csv]
   (doseq [domain-data (get-chunk-list items-list chunk max-chunks)]
     (let [line   (get domain-data 0)
           url    (get domain-data 1)
           ;; The slow check runs outside the lock so workers stay parallel.
           result (wordpress? url)]
       ;; Show info
       (prn (str line " " url))
       ;; Only the file rewrite is serialised.
       (locking csv-write-lock
         (shell/sh "sed" "-i"
                   (str line "s/,nil$/," result "/")
                   (str "resources/" file-csv)))))))

(defn -main
  "Entry point: analyses every unchecked domain of a CSV using worker threads.

  args:
    1. name of the CSV file with the domains (defaults to \"top-1m.csv\")
    2. number of worker threads (defaults to 1)

  Rows whose third column is \"nil\" are treated as unchecked. They are split
  into one chunk per thread, and \"Complete\" is printed only after every
  worker has finished."
  [& args]
  (let [;; Name of the file containing the CSV with the domains
        file-csv          (or (first args) "top-1m.csv")
        ;; Command-line arguments arrive as strings, so parse the count.
        number-of-threads (if-let [n (second args)]
                            (Integer/parseInt (str n))
                            1)
        ;; Get domains from CSV
        domains-csv       (vec (read-csv-domains file-csv))
        ;; Filters leaving those that have not been checked
        domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
    (prn "Start")
    (let [workers (mapv (fn [i]
                          (doto (Thread. ^Runnable
                                         (fn []
                                           (analyse-list-chunk domains-unchecked
                                                               i
                                                               number-of-threads
                                                               file-csv)))
                            (.start)))
                        (range number-of-threads))]
      ;; Wait for every worker before reporting completion.
      (doseq [^Thread worker workers]
        (.join worker)))
    (prn "Complete")))