From cfb7f1dcb9e63108232528bf61cf5b3ddf8d3e4b Mon Sep 17 00:00:00 2001 From: Andros Fenollosa Date: Mon, 27 Sep 2021 18:37:34 +0200 Subject: [PATCH] Add cores --- src/wordpress_used/core.clj | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/wordpress_used/core.clj b/src/wordpress_used/core.clj index 45664ce..9884430 100644 --- a/src/wordpress_used/core.clj +++ b/src/wordpress_used/core.clj @@ -2,36 +2,51 @@ (:require [clojure.data.csv :as csv] [clojure.java.io :as io] - [clojure.java.shell :as shell] - ) (:gen-class)) + [clojure.java.shell :as shell]) (:gen-class)) (defn wordpress? "Check if a web page is generated with WordPress" [url] (= (clojure.string/trim-newline (:out (shell/sh "bash" "./is-wordpress/is-wordpress" url))) "true")) - (defn read-csv-domains "Read CSV file with all domains" [url] (with-open [reader (io/reader (io/resource url))] (doall (csv/read-csv reader)))) +(defn get-chunk-list + "Splits the vector items-list into max-chunks consecutive fragments and returns fragment number chunk (zero-based). Fragments that start past the end of the vector are empty." + [items-list chunk max-chunks] + (let [list-size (count items-list) + chunk-size (quot (+ list-size max-chunks -1) max-chunks) + chunk-start (min list-size (* chunk chunk-size)) + chunk-end (min list-size (+ chunk-start chunk-size))] + (subvec items-list chunk-start chunk-end))) + +(defn analyse-list-chunk + "Runs wordpress? on every row of fragment number chunk (out of max-chunks) of items-list and replaces nil with the result on that row's line of the file-csv file under resources/. Element 0 of a row is used as the sed line address, element 1 as the URL." + [items-list chunk max-chunks file-csv] + (doseq [domain-data (get-chunk-list items-list chunk max-chunks)] (let [line (get domain-data 0) + url (get domain-data 1)] + ;; Show info + (prn (str line " " url)) + ;; Check outside the lock, then edit the CSV under it: concurrent sed -i runs on one file would overwrite each other + (let [checked (wordpress? url)] (locking file-csv (shell/sh "sed" "-i" (str line "s/nil/" checked "/g") (str "resources/" file-csv))))))) (defn -main [& args] (let [;; Name of the file containing the CSV with the domains - file-csv "top-1m.csv" + file-csv (or (first args) "top-1m.csv") + ;; Number of threads to be executed (defaults to 1; command-line arguments are strings, so it is parsed).
+ number-of-threads (Integer/parseInt (or (second args) "1")) ;; Get domains from CSV domains-csv (vec (read-csv-domains file-csv)) ;; Filters leaving those that have not been checked domains-unchecked (vec (filter #(= (get % 2) "nil") domains-csv))] ;; List with domains with a boolean indicating if it is generate or not in WordPress (prn "Start") - (doseq [domain-data domains-unchecked] (let [line (get domain-data 0) - url (get domain-data 1)] - ;; Show info - (prn (str line " " url)) - ;; Edit domains-csv with check WordPress - (shell/sh "sed" "-i" (str line "s/nil/" (wordpress? url) "/g") (str "resources/" file-csv)))) + (let [workers (mapv (fn [i] (doto (Thread. (fn [] (analyse-list-chunk domains-unchecked i number-of-threads file-csv))) (.start))) (range number-of-threads))] + (doseq [^Thread worker workers] (.join worker))) ;; block until every worker has finished before reporting completion + (prn "Complete")))