Update project and core

This commit is contained in:
Andros Fenollosa 2019-10-07 23:46:00 +02:00
parent 6dd005c1f9
commit 612e713c33
4 changed files with 43 additions and 34 deletions

@ -1,11 +1,10 @@
;; Leiningen project definition for the wordpress-used tool.
(defproject wordpress-used "1.0.0-SNAPSHOT"
  :description "Calculates WordPress usage index from a CSV list of domains"
  ;; TODO: this is still the project-template placeholder; replace with the real project URL.
  :url "http://example.com/FIXME"
  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
            :url "https://www.eclipse.org/legal/epl-2.0/"}
  :dependencies [[org.clojure/clojure "1.10.0"]
                 [clj-http "3.10.0"]
                 [org.clojure/data.csv "0.1.4"]]
  ;; Namespace holding -main; also the namespace the REPL starts in.
  :main ^:skip-aot wordpress-used.core
  :repl-options {:init-ns wordpress-used.core})

@ -1,20 +1,4 @@
1,google.com 1,google.com
2,youtube.com 2,youtube.com
3,baidu.com
4,tmall.com 4,tmall.com
5,qq.com 5,idecrea.es
6,taobao.com
7,sohu.com
8,facebook.com
9,wikipedia.org
10,yahoo.com
11,login.tmall.com
12,amazon.com
13,360.cn
14,jd.com
15,weibo.com
16,sina.com.cn
17,live.com
18,reddit.com
19,pages.tmall.com
20,vk.com
1 1 google.com
2 2 youtube.com
3 baidu.com
3 4 tmall.com
4 5 qq.com idecrea.es
6 taobao.com
7 sohu.com
8 facebook.com
9 wikipedia.org
10 yahoo.com
11 login.tmall.com
12 amazon.com
13 360.cn
14 jd.com
15 weibo.com
16 sina.com.cn
17 live.com
18 reddit.com
19 pages.tmall.com
20 vk.com

@ -5,23 +5,45 @@
[clojure.java.io :as io] [clojure.java.io :as io]
) (:gen-class)) ) (:gen-class))
(defn read-csv-domains
  "Read every row of the CSV classpath resource named by `url`.

  `url` is resolved with `io/resource`, so it must be a name relative to the
  classpath rather than a filesystem path or a web address. Returns the fully
  realized sequence of rows produced by `csv/read-csv`.

  Throws IllegalArgumentException when no such resource exists."
  [url]
  (let [resource (io/resource url)]
    ;; io/resource yields nil for an unknown name; fail here with the name in
    ;; the message instead of letting io/reader choke on a nil argument.
    (when (nil? resource)
      (throw (IllegalArgumentException.
              (str "CSV resource not found on classpath: " url))))
    (with-open [reader (io/reader resource)]
      ;; read-csv is lazy: force all rows before with-open closes the reader.
      (doall (csv/read-csv reader)))))
(defn save-csv-domains
  "Write the rows in `new-domains` as CSV to the destination `url`.

  The destination is opened with `io/writer` and is closed again once
  writing finishes, whether or not it succeeded."
  [url new-domains]
  (with-open [out (io/writer url)]
    (csv/write-csv out new-domains)))
(defn wordpress?
  "Check if a web page is generated with WordPress.

  Fetches http://<url>/ and returns true when the response body contains a
  `meta ... generator ... WordPress` tag on a single line, false otherwise.
  Also returns false when there is no textual body to inspect, e.g. because
  the host name could not be resolved."
  [url]
  (let [response (client/get (str "http://" url "/")
                             {:ignore-unknown-host? true
                              :connection-timeout 5000
                              :throw-exceptions false})
        ;; With :ignore-unknown-host? the request yields nil for an
        ;; unresolvable host, so the body may be missing entirely.
        body (:body response)]
    ;; Guard before matching: re-find throws on a nil input.
    (boolean (and (string? body)
                  (re-find #"meta.*generator.*WordPress" body)))))
(defn -main
  "Check every domain listed in a CSV file for WordPress and save the result.

  The input is read as a classpath resource; its name is the first
  command-line argument, or \"top-1m-test.csv\" when none is given. Each row
  gets the result of `wordpress?` for its second column appended, and the
  extended rows are then written with `save-csv-domains` under the same name."
  [& args]
  (let [;; Name of the CSV resource containing the domains
        file-csv (or (first args) "top-1m-test.csv")
        ;; Rows read from the CSV
        domains (read-csv-domains file-csv)
        ;; Rows with a boolean appended indicating whether the site is
        ;; generated with WordPress; mapv is eager, so every request has
        ;; completed before anything is written.
        domains-checks (mapv #(conj % (wordpress? (get % 1))) domains)]
    ;; Save the checked rows to CSV
    (save-csv-domains file-csv domains-checks)))

4
top-1m-test.csv Normal file

@ -0,0 +1,4 @@
1,google.com,false
2,youtube.com,false
4,tmall.com,false
5,idecrea.es,true
1 1 google.com false
2 2 youtube.com false
3 4 tmall.com false
4 5 idecrea.es true