Compare commits

...

23 Commits
curl ... master

Author SHA1 Message Date
Andros Fenollosa
215b736472 Add SQLite version 2021 2021-09-28 12:34:15 +02:00
Andros Fenollosa
fa9fa1dca6 Create 2021-top-1m.csv 2021-09-27 18:53:02 +02:00
Andros Fenollosa
3410be52fa Compress data 2021-09-27 18:53:01 +02:00
Andros Fenollosa
cfb7f1dcb9 Add cores 2021-09-27 18:49:54 +02:00
Andros Fenollosa
ffa15e4178
Update README.md 2020-01-09 00:03:16 +01:00
Andros Fenollosa
9de202f90c Rename 2020-01-09 00:02:12 +01:00
Andros Fenollosa
8f7a752e47
Add files via upload 2020-01-08 23:59:44 +01:00
Andros Fenollosa
5e40532a5f
Update README.md 2019-12-04 19:19:00 +01:00
Andros Fenollosa
533100b43c
Update README.md 2019-12-04 19:15:27 +01:00
Andros Fenollosa
e6eef9b5e9 Add repo is-wordpress 2019-12-04 19:10:34 +01:00
Andros Fenollosa
91584d565f
Delete is-wordpress.sh 2019-12-01 11:02:22 +01:00
Andros Fenollosa
0329d875e8
Create is-wordpress.sh 2019-12-01 10:32:39 +01:00
Andros Fenollosa
af0a771645
Update README.md 2019-11-03 11:06:13 +01:00
Andros Fenollosa
d42965972c
Update Makefile 2019-11-03 10:40:35 +01:00
Andros Fenollosa
5aa344aa2d
Update README.md 2019-11-02 16:15:44 +01:00
Andros Fenollosa
34a380dd19
Add files via upload 2019-10-31 18:22:02 +01:00
Andros Fenollosa
f15acb849b
Delete top-1m-test.csv 2019-10-31 18:00:50 +01:00
Andros Fenollosa
d9de62f265
Update README.md 2019-10-31 17:56:09 +01:00
Andros Fenollosa
9572c4bdcd
Rename calcular-porcentaje.sh to calculate-percentage.sh 2019-10-31 17:55:51 +01:00
Andros Fenollosa
1cb66fb794
Update README.md 2019-10-31 17:54:27 +01:00
Andros Fenollosa
63f74932c5
Script calculate 2019-10-31 17:44:48 +01:00
Andros Fenollosa
5cff50d1d9
Update core.clj 2019-10-23 19:58:17 +02:00
Andros Fenollosa
30e0110cb7
Update README.md 2019-10-23 19:50:50 +02:00
12 changed files with 2225057 additions and 40 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "is-wordpress"]
path = is-wordpress
url = git@github.com:tanrax/is-wordpress.git

898391
2019-top-1m.csv Normal file

File diff suppressed because it is too large Load Diff

888703
2020-top-1m.csv Normal file

File diff suppressed because it is too large Load Diff

437884
2021-top-1m.csv Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

BIN
2021-tranco-5.5m.tar.bz2 Normal file

Binary file not shown.

View File

@ -9,6 +9,7 @@ prepare: ## Download and format csv with domains
unzip top-1m.csv.zip unzip top-1m.csv.zip
rm top-1m.csv.zip rm top-1m.csv.zip
cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
mkdir -p resources
mv top-1m-temp.csv resources/top-1m.csv mv top-1m-temp.csv resources/top-1m.csv
rm top-1m.csv rm top-1m.csv

View File

@ -1,7 +1,17 @@
# Calculate WordPress usage worldwide
The following script analyzes the list of the one million most-visited domains and reports the percentage of them that use WordPress.
Warning: it can take a long time, between 20 and 30 days.
## Requirements
To run it you'll need either 2 GB of RAM or a swap file of the same size.
## Install ## Install
``` sh ``` sh
sudo apt install clojure leiningen wget make sudo apt install clojure leiningen wget make curl bash unzip
``` ```
## Prepare ## Prepare
@ -13,7 +23,31 @@ make prepare
## Run ## Run
``` sh ``` sh
make run lein run
``` ```
OutOfMemoryError: Java heap space When all the CSV sites are analyzed, you can see the final figure by running the following script
## Calculate percentage
``` sh
bash calculate-percentage.sh
```
## Historical
### 2019
19%
### 2020
29%
## Article with conclusions (in Spanish)
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
## Hacker News (comments)
https://news.ycombinator.com/item?id=21428149

8
calculate-percentage.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Print the share of domains detected as WordPress, as a whole-number percentage.
#
# Every row of the CSV is expected to end in a status field. Rows ending in
# ",true" count as WordPress; rows ending in ",timeout" are removed from the
# denominator; every other row (including not-yet-analysed ones) stays in it.
# The result is truncated to an integer, e.g. "29%".
CSV="resources/top-1m.csv"

# Fail with a clear message instead of computing on an unreadable file.
if [ ! -r "$CSV" ]; then
    echo "Cannot read $CSV" >&2
    exit 1
fi

# grep -c counts matching lines directly (prints 0 when nothing matches).
NUM_WORDPRESS=$(grep -c ',true$' "$CSV")
NUM_TIMEOUT=$(grep -c ',timeout$' "$CSV")
TOTAL=$(wc -l < "$CSV")

# Domains that actually answered; guard against dividing by zero when the
# file is empty or every row timed out.
REACHABLE=$((TOTAL - NUM_TIMEOUT))
if [ "$REACHABLE" -le 0 ]; then
    echo "No reachable domains found in $CSV" >&2
    exit 1
fi

# Shell integer arithmetic truncates, so no external calculator is needed.
echo "$((NUM_WORDPRESS * 100 / REACHABLE))%"

1
is-wordpress Submodule

@ -0,0 +1 @@
Subproject commit b64770eb63dcb3924957bed824e79d380089e2c2

View File

@ -1,4 +0,0 @@
1,google.com,true
2,youtube.com,false
4,tmall.com,nil
5,idecrea.es,nil
1 1 google.com true
2 2 youtube.com false
3 4 tmall.com nil
4 5 idecrea.es nil

View File

@ -1,26 +1,13 @@
(ns wordpress-used.core (ns wordpress-used.core
(:require (:require
[clj-http.client :as client]
[clojure.data.csv :as csv] [clojure.data.csv :as csv]
[clojure.java.io :as io] [clojure.java.io :as io]
[clojure.java.shell :as shell] [clojure.java.shell :as shell]) (:gen-class))
) (:gen-class))
(def headers {"User-Agent" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0" (defn wordpress?
"Accept" "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" "Check if a web page is generated with WordPress"
"Accept-Language" "es,en-US;q=0.7,en;q=0.3" [url]
"Accept-Encoding" "gzip, deflate, br" (= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true"))
"DNT" "1"
"Connection" "keep-alive"
"Upgrade-Insecure-Requests" "1"
"Pragma" "no-cache"
"Cache-Control" "no-cache"
"TE" "Trailers"})
(def http-config
{:headers headers
:ignore-unknown-host? true
:connection-timeout 5000
:throw-exceptions false})
(defn read-csv-domains (defn read-csv-domains
"Read CSV file with all domains" "Read CSV file with all domains"
@ -28,29 +15,38 @@
(with-open [reader (io/reader (io/resource url))] (with-open [reader (io/reader (io/resource url))]
(doall (csv/read-csv reader)))) (doall (csv/read-csv reader))))
(defn wordpress? (defn get-chunk-list
"Check if a web page is generated with WordPress" "Cuts a list by the maximum number of fragments and returns the selected fragment."
[url] [items-list chunk max-chunks]
(try (let [list-size (count items-list)
(let [response (client/get (str "http://" url "/") http-config)] chunk-size (Math/ceil (/ list-size max-chunks))
(every? identity [(re-find (re-pattern "meta.*generator.*WordPress") (:body response))])) chunk-start (int (* chunk chunk-size))
(catch Exception e chunk-end (int (+ (* chunk chunk-size) chunk-size))]
"timeout"))) (subvec items-list chunk-start (if (>= chunk-end list-size) list-size chunk-end))))
(defn analyse-list-chunk
  "Analyse a single chunk of the domain list and record each result in the CSV.

  items-list - vector of CSV rows; column 0 is used as the sed line address
               and column 1 as the domain to check.
  chunk      - zero-based index of the chunk to process.
  max-chunks - total number of chunks the list is divided into.
  file-csv   - name of the CSV file inside resources/ that is edited in place;
               defaults to \"top-1m.csv\" when omitted.

  Returns nil; called for its side effects (progress output and CSV edits)."
  ([items-list chunk max-chunks]
   (analyse-list-chunk items-list chunk max-chunks "top-1m.csv"))
  ([items-list chunk max-chunks file-csv]
   (doseq [domain-data (get-chunk-list items-list chunk max-chunks)]
     (let [line (get domain-data 0)
           url  (get domain-data 1)]
       ;; Show info
       (prn (str line " " url))
       ;; Record the check result on this row. Only the trailing ",nil" status
       ;; field is replaced, so a domain that itself contains "nil" is left intact.
       (shell/sh "sed" "-i"
                 (str line "s/,nil$/," (wordpress? url) "/")
                 (str "resources/" file-csv))))))
(defn -main (defn -main
[& args] [& args]
(let [;; Name of the file containing the CSV with the domains (let [;; Name of the file containing the CSV with the domains
file-csv "top-1m.csv" file-csv (first args)
;; Number of threads to be executed.
number-of-threads (second args)
;; Get domains from CSV ;; Get domains from CSV
domains-csv (vec (read-csv-domains file-csv)) domains-csv (vec (read-csv-domains file-csv))
;; Filters leaving those that have not been checked ;; Filters leaving those that have not been checked
domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))] domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
;; List with domains with a boolean indicating if it is generate or not in WordPress ;; List with domains with a boolean indicating if it is generate or not in WordPress
(prn "Start") (prn "Start")
(doseq [domain-data domains-unchecked] (let [line (get domain-data 0) (dotimes [i number-of-threads] (.start (Thread. (fn []
url (get domain-data 1)] (analyse-list-chunk domains-uncheked i number-of-threads)))))
;; Show info
(prn (str line " " url))
;; Edit domains-csv with check WordPress
(shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv))))
(prn "Complete"))) (prn "Complete")))