diff --git a/project.clj b/project.clj
index 8b35619..f7833b9 100644
--- a/project.clj
+++ b/project.clj
@@ -1,11 +1,10 @@
-(defproject wordpress-used "0.1.0-SNAPSHOT"
-  :description "FIXME: write description"
+(defproject wordpress-used "1.0.0-SNAPSHOT"
+  :description "Calculates WordPress usage index from a CSV list of domains"
   :url "http://example.com/FIXME"
   :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
             :url "https://www.eclipse.org/legal/epl-2.0/"}
   :dependencies [[org.clojure/clojure "1.10.0"]
                  [clj-http "3.10.0"]
-                 [org.clojure/data.csv "0.1.4"]
-                 ]
+                 [org.clojure/data.csv "0.1.4"]]
   :main ^:skip-aot wordpress-used.core
   :repl-options {:init-ns wordpress-used.core})
diff --git a/resources/top-1m-test.csv b/resources/top-1m-test.csv
index e546aea..911c049 100644
--- a/resources/top-1m-test.csv
+++ b/resources/top-1m-test.csv
@@ -1,20 +1,4 @@
 1,google.com
 2,youtube.com
-3,baidu.com
 4,tmall.com
-5,qq.com
-6,taobao.com
-7,sohu.com
-8,facebook.com
-9,wikipedia.org
-10,yahoo.com
-11,login.tmall.com
-12,amazon.com
-13,360.cn
-14,jd.com
-15,weibo.com
-16,sina.com.cn
-17,live.com
-18,reddit.com
-19,pages.tmall.com
-20,vk.com
+5,idecrea.es
\ No newline at end of file
diff --git a/src/wordpress_used/core.clj b/src/wordpress_used/core.clj
index be513dc..2cec18d 100644
--- a/src/wordpress_used/core.clj
+++ b/src/wordpress_used/core.clj
@@ -5,23 +5,32 @@
             [clojure.java.io :as io]
             )
   (:gen-class))
+(defn read-csv-domains
+  "Read the CSV classpath resource named `url` and return all of its rows, fully realized."
+  [url]
+  (with-open [reader (io/reader (io/resource url))]
+    (doall (csv/read-csv reader))))
+
+(defn save-csv-domains
+  "Write the rows in `new-domains` as CSV to the file at path `url`."
+  [url new-domains]
+  (with-open [writer (io/writer url)]
+    (csv/write-csv writer new-domains)))
+
 (defn wordpress?
-  "Check site used WordPress with meta generator"
+  "Return true if the body served at http://<url>/ matches `meta.*generator.*WordPress`, false otherwise (also when there is no response body, e.g. unknown host)."
   [url]
   (let [response (client/get (str "http://" url "/") {:ignore-unknown-host? true, :connection-timeout 5000, :throw-exceptions false})]
-    (every? identity [(re-find (re-pattern "meta.*generator.*WordPress") (:body response))])))
+    ;; client/get yields nil for an unknown host; some->> skips re-find on a nil body instead of throwing
+    (boolean (some->> (:body response) (re-find (re-pattern "meta.*generator.*WordPress"))))))
 
-
 (defn -main
   [& args]
-  ;; Read CSV with all domains
-  (with-open [reader (io/reader (clojure.java.io/resource "top-1m-test.csv"))]
-    (doall
-      (let [domains (csv/read-csv reader)
-            ;; Check is WordPress
-            domains-with-wordpress (doall (map #(conj % (wordpress? (get % 1))) domains))]
-            ;;domains-with-wordpress (map #(conj % (wordpress? (get % 1))) domains)]
-        ;; Save CSV
-        (with-open [writer (io/writer (clojure.java.io/resource "top-1m-test.csv"))]
-          (csv/write-csv writer (vec domains-with-wordpress)))
-        ))))
+  (let [;; Name of the file containing the CSV with the domains
+        file-csv "top-1m-test.csv"
+        ;; List with domains
+        domains (read-csv-domains file-csv)
+        ;; Domains with a boolean appended indicating whether each one is generated with WordPress
+        domains-checks (mapv #(conj % (wordpress? (get % 1))) domains)]
+    ;; Save domains to CSV
+    (save-csv-domains file-csv domains-checks)))
diff --git a/top-1m-test.csv b/top-1m-test.csv
new file mode 100644
index 0000000..6b502c4
--- /dev/null
+++ b/top-1m-test.csv
@@ -0,0 +1,4 @@
+1,google.com,false
+2,youtube.com,false
+4,tmall.com,false
+5,idecrea.es,true