Compare commits
	
		
			2 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					8f09d77647 | ||
| 
						 | 
					de49bade47 | 
							
								
								
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							@@ -1,3 +0,0 @@
 | 
				
			|||||||
[submodule "is-wordpress"]
 | 
					 | 
				
			||||||
	path = is-wordpress
 | 
					 | 
				
			||||||
	url = git@github.com:tanrax/is-wordpress.git
 | 
					 | 
				
			||||||
							
								
								
									
										898391
									
								
								2019-top-1m.csv
									
									
									
									
									
								
							
							
						
						
									
										898391
									
								
								2019-top-1m.csv
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										888703
									
								
								2020-top-1m.csv
									
									
									
									
									
								
							
							
						
						
									
										888703
									
								
								2020-top-1m.csv
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										437884
									
								
								2021-top-1m.csv
									
									
									
									
									
								
							
							
						
						
									
										437884
									
								
								2021-top-1m.csv
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										4
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								Makefile
									
									
									
									
									
								
							@@ -9,9 +9,11 @@ prepare: ## Download and format csv with domains
 | 
				
			|||||||
		unzip top-1m.csv.zip
 | 
							unzip top-1m.csv.zip
 | 
				
			||||||
		rm top-1m.csv.zip
 | 
							rm top-1m.csv.zip
 | 
				
			||||||
		cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
 | 
							cat top-1m.csv | sed -n 's/$$/,nil/p' >> top-1m-temp.csv
 | 
				
			||||||
		mkdir -p resources
 | 
					 | 
				
			||||||
		mv top-1m-temp.csv resources/top-1m.csv
 | 
							mv top-1m-temp.csv resources/top-1m.csv
 | 
				
			||||||
		rm top-1m.csv
 | 
							rm top-1m.csv
 | 
				
			||||||
 | 
					
 | 
				
			||||||
run: ## Run checks 
 | 
					run: ## Run checks 
 | 
				
			||||||
		lein run
 | 
							lein run
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					calculate: ## Calculate the percentage
 | 
				
			||||||
 | 
							(awk 'END {print NR}' resources/top-1m.csv) / (cat resources/top-1m.csv | grep ',true' | wc -l)
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										40
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										40
									
								
								README.md
									
									
									
									
									
								
							@@ -1,17 +1,7 @@
 | 
				
			|||||||
# Calculate WordPress usage worldwide
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
The following script will analyze the list of the first million domains with the most visits to give you the percentage of use.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Warning: it can take a long time, between 20 and 30 days.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
## Requirements
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
To run it you'll need either 2 GB of RAM or a swap file of the same size.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
## Install
 | 
					## Install
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``` sh
 | 
					``` sh
 | 
				
			||||||
sudo apt install clojure leiningen wget make curl bash unzip
 | 
					sudo apt install clojure leiningen wget make
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Prepare
 | 
					## Prepare
 | 
				
			||||||
@@ -23,31 +13,7 @@ make prepare
 | 
				
			|||||||
## Run
 | 
					## Run
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``` sh
 | 
					``` sh
 | 
				
			||||||
lein run
 | 
					make run
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
When all the sites in the CSV have been analyzed, you can see the final figure by running the following script:
 | 
					OutOfMemoryError: Java heap space
 | 
				
			||||||
 | 
					 | 
				
			||||||
## Calculate percentage
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
``` sh
 | 
					 | 
				
			||||||
bash calculate-percentage.sh
 | 
					 | 
				
			||||||
```
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
## Historical
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
### 2019
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
19%
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
### 2020
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
29%
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
## Article with conclusions (in Spanish)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
https://programadorwebvalencia.com/analizando-un-millon-de-paginas-para-saber-cuanto-se-usa-wordpress-2019/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
## Hacker News (comments)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
https://news.ycombinator.com/item?id=21428149
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,8 +0,0 @@
 | 
				
			|||||||
#!/bin/bash
 | 
					 | 
				
			||||||
CSV="resources/top-1m.csv"
 | 
					 | 
				
			||||||
NUM_WORDPRESS=$(cat $CSV | grep ,true$ | wc -l)
 | 
					 | 
				
			||||||
NUM_NOT_WORDPRESS=$(cat $CSV | grep ,false$ | wc -l)
 | 
					 | 
				
			||||||
NUM_TIMEOUT=$(cat $CSV | grep ,timeout$ | wc -l)
 | 
					 | 
				
			||||||
TOTAL=$(cat $CSV | wc -l)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
echo "($NUM_WORDPRESS * 100) / ($TOTAL - $NUM_TIMEOUT)" | bc | awk '{print $1"%"}'
 | 
					 | 
				
			||||||
 Submodule is-wordpress deleted from b64770eb63
									
								
							@@ -3,8 +3,7 @@
 | 
				
			|||||||
  :url "http://example.com/FIXME"
 | 
					  :url "http://example.com/FIXME"
 | 
				
			||||||
  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
 | 
					  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
 | 
				
			||||||
            :url  "https://www.eclipse.org/legal/epl-2.0/"}
 | 
					            :url  "https://www.eclipse.org/legal/epl-2.0/"}
 | 
				
			||||||
  :dependencies [[org.clojure/clojure "1.10.0"]
 | 
					  :dependencies [[org.clojure/clojure "1.10.1"]
 | 
				
			||||||
                 [clj-http "3.10.0"]
 | 
					 | 
				
			||||||
                 [org.clojure/data.csv "0.1.4"]]
 | 
					                 [org.clojure/data.csv "0.1.4"]]
 | 
				
			||||||
  :jvm-opts ["-Xmx1G"]
 | 
					  :jvm-opts ["-Xmx1G"]
 | 
				
			||||||
  :main ^:skip-aot wordpress-used.core
 | 
					  :main ^:skip-aot wordpress-used.core
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										4
									
								
								resources/top-1m-test.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								resources/top-1m-test.csv
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,4 @@
 | 
				
			|||||||
 | 
					1,google.com,true
 | 
				
			||||||
 | 
					2,youtube.com,false
 | 
				
			||||||
 | 
					4,tmall.com,nil
 | 
				
			||||||
 | 
					5,idecrea.es,nil
 | 
				
			||||||
		
		
			
  | 
@@ -2,12 +2,13 @@
 | 
				
			|||||||
  (:require
 | 
					  (:require
 | 
				
			||||||
   [clojure.data.csv :as csv]
 | 
					   [clojure.data.csv :as csv]
 | 
				
			||||||
   [clojure.java.io :as io]
 | 
					   [clojure.java.io :as io]
 | 
				
			||||||
   [clojure.java.shell :as shell]) (:gen-class))
 | 
					   [clojure.java.shell :as shell]
 | 
				
			||||||
 | 
					   ) (:gen-class))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn wordpress?
 | 
					(defn request
 | 
				
			||||||
  "Check if a web page is generated with WordPress"
 | 
					  "Make a request by means of curl"
 | 
				
			||||||
  [url]
 | 
					  [url]
 | 
				
			||||||
  (= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true"))
 | 
					  (shell/sh "curl" "-L" "--max-time" "5" "-H" "User-Agent: Firefox" url))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn read-csv-domains
 | 
					(defn read-csv-domains
 | 
				
			||||||
  "Read CSV file with all domains"
 | 
					  "Read CSV file with all domains"
 | 
				
			||||||
@@ -15,38 +16,26 @@
 | 
				
			|||||||
  (with-open [reader (io/reader (io/resource url))]
 | 
					  (with-open [reader (io/reader (io/resource url))]
 | 
				
			||||||
    (doall (csv/read-csv reader))))
 | 
					    (doall (csv/read-csv reader))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn get-chunk-list
 | 
					(defn wordpress?
 | 
				
			||||||
  "Cuts a list by the maximum number of fragments and returns the selected fragment."
 | 
					  "Check if a web page is generated with WordPress"
 | 
				
			||||||
  [items-list chunk max-chunks]
 | 
					  [url]
 | 
				
			||||||
  (let [list-size   (count items-list)
 | 
					  (let [response (request url)]
 | 
				
			||||||
        chunk-size  (Math/ceil (/ list-size max-chunks))
 | 
					    (every? identity [(re-find (re-pattern "meta.*generator.*WordPress") (:out response))])))
 | 
				
			||||||
        chunk-start (int (* chunk chunk-size))
 | 
					 | 
				
			||||||
        chunk-end   (int (+ (* chunk chunk-size) chunk-size))]
 | 
					 | 
				
			||||||
    (subvec items-list chunk-start (if (>= chunk-end list-size) list-size chunk-end))))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
(defn analyse-list-chunk
 | 
					 | 
				
			||||||
  "Analyse only one chunk of the given list"
 | 
					 | 
				
			||||||
  [items-list chunk max-chunks]
 | 
					 | 
				
			||||||
  (doseq [domain-data (get-chunk-list items-list chunk max-chunks)] (let [line (get domain-data 0)
 | 
					 | 
				
			||||||
                                                                          url  (get domain-data 1)]
 | 
					 | 
				
			||||||
                                                                      ;; Show info
 | 
					 | 
				
			||||||
                                                                      (prn (str line " " url))
 | 
					 | 
				
			||||||
                                                                      ;; Edit domains-csv with check WordPress
 | 
					 | 
				
			||||||
                                                                      (shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv)))))
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn -main
 | 
					(defn -main
 | 
				
			||||||
  [& args]
 | 
					  [& args]
 | 
				
			||||||
  (let [;; Name of the file containing the CSV with the domains
 | 
					  (let [;; Name of the file containing the CSV with the domains
 | 
				
			||||||
        file-csv          (first args)
 | 
					        file-csv          "top-1m.csv"
 | 
				
			||||||
        ;; Number of threads to be executed.
 | 
					 | 
				
			||||||
        number-of-threads (second args)
 | 
					 | 
				
			||||||
        ;; Get domains from CSV
 | 
					        ;; Get domains from CSV
 | 
				
			||||||
        domains-csv       (vec (read-csv-domains file-csv))
 | 
					        domains-csv       (vec (read-csv-domains file-csv))
 | 
				
			||||||
        ;; Filters leaving those that have not been checked
 | 
					        ;; Filters leaving those that have not been checked
 | 
				
			||||||
        domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
 | 
					        domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
 | 
				
			||||||
    ;; List of domains, each with a boolean indicating whether or not it is generated with WordPress
 | 
					    ;; List of domains, each with a boolean indicating whether or not it is generated with WordPress
 | 
				
			||||||
    (prn "Start")
 | 
					    (prn "Start")
 | 
				
			||||||
    (dotimes [i number-of-threads] (.start (Thread. (fn []
 | 
					    (doseq [domain-data domains-unchecked] (let [line (get domain-data 0)
 | 
				
			||||||
                                                      (analyse-list-chunk domains-uncheked i number-of-threads)))))
 | 
					                                                 url  (get domain-data 1)]
 | 
				
			||||||
 | 
					                                             ;; Show info
 | 
				
			||||||
 | 
					                                             (prn (str line " " url))
 | 
				
			||||||
 | 
					                                             ;; Edit domains-csv with check WordPress
 | 
				
			||||||
 | 
					                                             (shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv))))
 | 
				
			||||||
    (prn "Complete")))
 | 
					    (prn "Complete")))
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user