From 174abd807253ab3fd28f4e64a6792f609e0674b6 Mon Sep 17 00:00:00 2001
From: KKlochko
Date: Wed, 13 Sep 2023 21:36:23 +0300
Subject: [PATCH] Add simple language analyzers.

---
 project.clj                                   |  4 +-
 .../language_analyzer.clj                     | 21 +++++++++
 .../language_analyzer_test.clj                | 45 +++++++++++++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 src/cipher_analytical_machine/language_analyzer.clj
 create mode 100644 test/cipher_analytical_machine/language_analyzer_test.clj

diff --git a/project.clj b/project.clj
index 92a7fed..b612dd7 100644
--- a/project.clj
+++ b/project.clj
@@ -4,7 +4,9 @@
   :license {:name "LGPL"
             :url "http://www.gnu.org/licenses/lgpl-3.0.txt"}
   :dependencies [[org.clojure/clojure "1.11.1"]
-                 [org.clojure/tools.cli "1.0.219"]]
+                 [org.clojure/tools.cli "1.0.219"]
+                 [org.apache.tika/tika-core "1.28.5"]
+                 [org.apache.tika/tika-langdetect "1.28.5"]]
   :main ^:skip-aot cipher-analytical-machine.core
   :target-path "target/%s"
   :profiles {:uberjar {:aot :all
diff --git a/src/cipher_analytical_machine/language_analyzer.clj b/src/cipher_analytical_machine/language_analyzer.clj
new file mode 100644
index 0000000..d3681f6
--- /dev/null
+++ b/src/cipher_analytical_machine/language_analyzer.clj
@@ -0,0 +1,21 @@
+(ns cipher-analytical-machine.language_analyzer
+  (:import [org.apache.tika.language LanguageIdentifier])
+  (:gen-class))
+
+(defn detect-language
+  "Return the language code (such as \"en\" or \"uk\") that Tika's LanguageIdentifier reports for the text."
+  [text]
+  (-> (new LanguageIdentifier text)
+      .getLanguage))
+
+(defn is-language?
+  "Check whether the language detected for the text equals language-id."
+  [text language-id]
+  (= (detect-language text)
+     language-id))
+
+(defn is-nonsense?
+  "Check whether the language detected for the text differs from language-id (the negation of is-language?)."
+  [text language-id]
+  (not (is-language? text language-id)))
+
diff --git a/test/cipher_analytical_machine/language_analyzer_test.clj b/test/cipher_analytical_machine/language_analyzer_test.clj
new file mode 100644
index 0000000..523fc42
--- /dev/null
+++ b/test/cipher_analytical_machine/language_analyzer_test.clj
@@ -0,0 +1,45 @@
+(ns cipher-analytical-machine.language_analyzer_test
+  (:require
+   [clojure.test :refer :all]
+   [cipher-analytical-machine.language_analyzer :refer :all]
+   ))
+
+(deftest detect-language-test
+  (testing "Checking the English text"
+    (is (= "en" (detect-language "This is a sentence that uses English."))))
+
+  (testing "Checking the Ukrainian text"
+    (is (= "uk" (detect-language "Текст, що написаний українською."))))
+
+  (testing "Checking a gibberish that uses English letters"
+    (is (= "lt" (detect-language "dfgjldfjgdfl gjdfg idfjig hdgesr khs e."))))
+
+  (testing "Checking a gibberish that uses Ukrainian letters"
+    (is (= "uk" (detect-language "іаврпшшцді врадів аргів аріл варї йцґ.")))))
+
+(deftest is-language?-test
+  (testing "Checking the English text"
+    (is (is-language? "This is a sentence that uses English." "en")))
+
+  (testing "Checking the Ukrainian text"
+    (is (is-language? "Текст, що написаний українською." "uk")))
+
+  (testing "Checking a gibberish that uses English letters"
+    (is (is-language? "dfgjldfjgdfl gjdfg idfjig hdgesr khs e." "lt")))
+
+  (testing "Checking a gibberish that uses Ukrainian letters"
+    (is (is-language? "іаврпшшцді врадів аргів аріл варї йцґ." "uk"))))
+
+(deftest is-nonsense?-test
+  (testing "Checking the English text"
+    (is (not (is-nonsense? "This is a sentence that uses English." "en"))))
+
+  (testing "Checking the Ukrainian text"
+    (is (not (is-nonsense? "Текст, що написаний українською." "uk"))))
+
+  (testing "Checking a gibberish that uses English letters"
+    (is (is-nonsense? "dfgjldfjgdfl gjdfg idfjig hdgesr khs e." "en")))
+
+  (testing "Checking a gibberish that uses Ukrainian letters" ; still detected as "uk" (see detect-language-test), so it is not flagged
+    (is (not (is-nonsense? "іаврпшшцді врадів аргів аріл варї йцґ." "uk")))))
+