calculate-wordpress-usage/src/wordpress_used/core.clj

44 lines
2.0 KiB
Clojure
Raw Normal View History

2019-10-07 19:35:09 +02:00
(ns wordpress-used.core
(:require
[clj-http.client :as client]
[clojure.data.csv :as csv]
[clojure.java.io :as io]
2019-10-10 00:37:22 +02:00
[clojure.java.shell :refer [sh]]
2019-10-07 19:35:09 +02:00
) (:gen-class))
2019-10-07 23:46:00 +02:00
(defn read-csv-domains
  "Load every row of the CSV resource named by `url`.

  `url` is resolved with `io/resource`; all rows are realized before the
  reader is closed, so the returned sequence is safe to use afterwards."
  [url]
  (let [source (io/resource url)]
    (with-open [rdr (io/reader source)]
      (-> rdr
          csv/read-csv
          doall))))
(defn save-csv-domains
  "Write the rows in `new-domains` as CSV into the resource named by `url`.

  `url` is resolved with `io/resource`, and the writer is closed once all rows
  have been written."
  [url new-domains]
  (let [target (io/resource url)]
    (with-open [out (io/writer target)]
      (csv/write-csv out new-domains))))
2019-10-07 19:35:09 +02:00
(defn wordpress?
  "Check if a web page is generated with WordPress.

  Fetches `http://<url>/` and returns true when the response body contains a
  `meta ... generator ... WordPress` tag on a single line, false otherwise.
  Returns false when there is no body to inspect, e.g. when the host is
  unknown (`:ignore-unknown-host? true` makes the client return nil instead of
  throwing)."
  [url]
  (let [response (client/get (str "http://" url "/")
                             {:ignore-unknown-host? true
                              :connection-timeout 5000
                              :throw-exceptions false})
        body (:body response)]
    ;; Guard the body first: `re-find` throws a NullPointerException on nil,
    ;; which is exactly what an unknown host produces.
    (boolean (and (string? body)
                  (re-find #"meta.*generator.*WordPress" body)))))
(defn -main
  "Check every not-yet-checked domain of the CSV and record the result in place.

  Rows are read as `line,domain,checked`; only rows whose third column is the
  literal string \"nil\" are processed. For each of them the domain is probed
  with `wordpress?` and the row in `resources/<file>` is rewritten to
  `line,domain,<true|false>` using `sed -i`.

  An optional first command-line argument overrides the CSV file name
  (default \"top-1m-test.csv\")."
  [& args]
  (let [;; Name of the file containing the CSV with the domains
        file-csv (or (first args) "top-1m-test.csv")
        ;; Path handed to sed for the in-place update
        csv-path (str "resources/" file-csv)
        ;; Get domains from CSV
        domains-csv (vec (read-csv-domains file-csv))
        ;; Keep only the rows that have not been checked yet
        domains-unchecked (filter #(= (get % 2) "nil") domains-csv)]
    (doseq [[line domain] domains-unchecked]
      (let [;; Check whether the domain is generated with WordPress
            check-wordpress (wordpress? domain)
            ;; sed script that replaces the whole row with the checked version.
            ;; No surrounding quotes: `sh` passes arguments straight to sed
            ;; without a shell, so quotes would become part of the script.
            ;; NOTE(review): assumes the first column equals the row's 1-based
            ;; line number in the file and that domains contain no `/`, `&`
            ;; or `\` - confirm against the real data.
            script (str line "s/.*/" line "," domain "," check-wordpress "/")
            result (sh "sed" "-i" script csv-path)]
        (prn (str line " " domain " " check-wordpress))
        ;; Surface sed failures instead of silently losing the result
        (when-not (zero? (:exit result))
          (prn result))))))