Add cores
This commit is contained in:
parent
ffa15e4178
commit
cfb7f1dcb9
@ -2,36 +2,51 @@
|
|||||||
(:require
|
(:require
|
||||||
[clojure.data.csv :as csv]
|
[clojure.data.csv :as csv]
|
||||||
[clojure.java.io :as io]
|
[clojure.java.io :as io]
|
||||||
[clojure.java.shell :as shell]
|
[clojure.java.shell :as shell]) (:gen-class))
|
||||||
) (:gen-class))
|
|
||||||
|
|
||||||
(defn wordpress?
  "Tell whether the page at `url` is reported as a WordPress site.

  Runs `bash ./is-wordpress/is-wordpress <url>` and returns true only when the
  script's standard output, with trailing newline characters removed, is
  exactly the string \"true\"."
  [url]
  (let [result (shell/sh "bash" "./is-wordpress/is-wordpress" url)
        answer (clojure.string/trim-newline (:out result))]
    (= answer "true")))
|
||||||
|
|
||||||
|
|
||||||
(defn read-csv-domains
  "Load all rows of the CSV classpath resource named by `url`.

  The rows are fully realised with `doall` before the reader is closed, so the
  returned sequence can be used after this function returns."
  [url]
  (let [resource (io/resource url)]
    (with-open [rdr (io/reader resource)]
      (doall (csv/read-csv rdr)))))
|
||||||
|
|
||||||
|
(defn get-chunk-list
  "Return the `chunk`-th slice (zero-based) of `items-list` when the list is
  cut into at most `max-chunks` contiguous slices.

  The slice size is the list size divided by `max-chunks`, rounded up, so the
  trailing slices may be shorter than the others or empty. A `chunk` whose
  slice would start past the end of the list yields an empty vector instead of
  an out-of-bounds error from `subvec`.

  `items-list` may be a vector or any other collection accepted by `vec`; the
  result is always a vector."
  [items-list chunk max-chunks]
  (let [items       (if (vector? items-list) items-list (vec items-list))
        list-size   (count items)
        ;; Integer ceiling division; avoids a round-trip through a double.
        chunk-size  (quot (+ list-size (dec max-chunks)) max-chunks)
        ;; Clamp BOTH bounds to the list size: with 5 items and 4 chunks the
        ;; slice size is 2, so chunk 3 would otherwise start at index 6.
        chunk-start (int (min list-size (* chunk chunk-size)))
        chunk-end   (int (min list-size (+ chunk-start chunk-size)))]
    (subvec items chunk-start chunk-end)))
|
||||||
|
|
||||||
|
(def ^:private csv-write-lock
  "Monitor that serialises in-place edits of the CSV file across worker threads."
  (Object.))

(defn analyse-list-chunk
  "Analyse only one chunk of the given list.

  For each row of the chunk selected by `get-chunk-list`, prints the row's
  first two fields (used here as the line address and the URL), runs
  `wordpress?` on the URL, and rewrites `nil` on that line of
  `resources/<file-csv>` with the result via `sed -i`.

  `file-csv` is the CSV file name relative to `resources/`; the three-argument
  arity uses \"top-1m.csv\"."
  ([items-list chunk max-chunks]
   (analyse-list-chunk items-list chunk max-chunks "top-1m.csv"))
  ([items-list chunk max-chunks file-csv]
   (doseq [domain-data (get-chunk-list items-list chunk max-chunks)]
     (let [line (get domain-data 0)
           url (get domain-data 1)]
       ;; Show info
       (prn (str line " " url))
       ;; Run the slow check outside the lock so chunks still progress in parallel.
       (let [result (wordpress? url)]
         ;; Edit domains-csv with check WordPress. `sed -i` rewrites the whole
         ;; file, so concurrent runs could drop each other's edits; serialise them.
         (locking csv-write-lock
           (shell/sh "sed" "-i" (str line "s/nil/" result "/g") (str "resources/" file-csv))))))))

(defn -main
  "Command-line entry point.

  Arguments: the CSV file name (a classpath resource that is also edited under
  `resources/`; defaults to \"top-1m.csv\") and the number of worker threads
  (defaults to 1). Rows whose third field is the string \"nil\" are split into
  one chunk per thread and analysed concurrently; \"Complete\" is printed only
  after every worker has finished."
  [& args]
  (let [;; Name of the file containing the CSV with the domains
        file-csv (or (first args) "top-1m.csv")
        ;; Number of threads to be executed. Command-line arguments arrive as
        ;; strings, so parse before using the value as a count.
        number-of-threads (if-let [n (second args)]
                            (Integer/parseInt (str n))
                            1)
        ;; Get domains from CSV
        domains-csv (vec (read-csv-domains file-csv))
        ;; Filters leaving those that have not been checked
        domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))]
    (prn "Start")
    ;; One worker per chunk; each one checks its own slice of the unchecked domains.
    (let [workers (mapv (fn [i]
                          (Thread. ^Runnable
                                   (fn []
                                     (analyse-list-chunk domains-unchecked i number-of-threads file-csv))))
                        (range number-of-threads))]
      (doseq [^Thread worker workers]
        (.start worker))
      ;; Wait for all workers so that \"Complete\" is truthful.
      (doseq [^Thread worker workers]
        (.join worker)))
    (prn "Complete")))
|
||||||
|
Loading…
Reference in New Issue
Block a user