From 25383f48ba98d53d48e49a3d09cb3e62dd5e10d6 Mon Sep 17 00:00:00 2001 From: Greg Date: Sat, 25 Jun 2022 13:29:34 -0500 Subject: [PATCH 1/2] Created an app directory, renamed hs-conllu.cabal hs-conllu.cabal.original, added package.yaml, added stack.yaml --- app1/Main.hs | 30 +++++++++ hs-conllu.cabal => hs-conllu.cabal.original | 0 package.yaml | 71 +++++++++++++++++++++ stack.yaml | 67 +++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 app1/Main.hs rename hs-conllu.cabal => hs-conllu.cabal.original (100%) create mode 100644 package.yaml create mode 100644 stack.yaml diff --git a/app1/Main.hs b/app1/Main.hs new file mode 100644 index 0000000..cb20ef6 --- /dev/null +++ b/app1/Main.hs @@ -0,0 +1,30 @@ +-- | +-- Module : Main +-- Copyright : © 2018 bruno cuconato +-- License : LPGL-3 +-- +-- Maintainer : bruno cuconato +-- Stability : experimental +-- Portability : non-portable +-- +-- the @hs-conllu@ executable. + +module Main + ( main ) +where + +import Conllu.IO (readAndPrintConllu, diffConllu) + +import System.Environment + +main :: IO () +-- | @validate@ : read CoNLL-U file and print it to stdout. (this will +-- only apply the command to one file, so use your terminal's +-- completion mechanism to apply it to several files. +main = do + (c:as) <- getArgs + case c of + "validate" -> mapM_ readAndPrintConllu as + "diff" -> diffConllu (as !! 0) (as !! 1) + _ -> return () + \ No newline at end of file diff --git a/hs-conllu.cabal b/hs-conllu.cabal.original similarity index 100% rename from hs-conllu.cabal rename to hs-conllu.cabal.original diff --git a/package.yaml b/package.yaml new file mode 100644 index 0000000..50563bd --- /dev/null +++ b/package.yaml @@ -0,0 +1,71 @@ +name: hs-conllu +version: 0.1.6 +synopsis: Conllu validating parser and utils. +description: utilities to parse, print, diff, and analyse data in CoNLL-U format. +homepage: https://github.com/odanoburu/hs-conllu +bug-reports: https://github.com/odanoburu/hs-conllu/issues +license: LGPL-3.0-only +license-file: LICENSE +author: bruno cuconato +maintainer: bruno cuconato +copyright: 2021 bruno cuconato +build-type: Simple +extra-doc-files: + - README + - CHANGELOG +tested-with: ghc == 9.2.2 + +verbatim: + cabal-version: 2.0 + +ghc-options: + - -Wall + - -Wcompat + - -Widentities + - -Wincomplete-uni-patterns + - -Wincomplete-record-updates + +default-extensions: + - OverloadedStrings + - LambdaCase +# - NoImplicitPrelude + +dependencies: + - aeson + - base >= 4.15 && < 5 + - bytestring + - containers + - directory + - filepath + - megaparsec + - mtl + - optparse-applicative + - scientific + - string-interpolate + - text + - text-conversions + - text-show + - unordered-containers + - vector + +library: + source-dirs: src + other-modules: [] + exposed-modules: + - Conllu.DeprelTagset + - Conllu.Diff + - Conllu.IO + - Conllu.Parse + - Conllu.Print + - Conllu.Test + - Conllu.Type + - Conllu.UposTagset + - Conllu.Utils + +executables: + hs-conllu: + main: Main.hs + source-dirs: app1 + dependencies: + - hs-conllu + diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..e0f50e8 --- /dev/null +++ b/stack.yaml @@ -0,0 +1,67 @@ +# This file was automatically generated by 'stack init' +# +# Some commonly used options have been documented as comments in this file. +# For advanced use and comprehensive documentation of the format, please see: +# https://docs.haskellstack.org/en/stable/yaml_configuration/ + +# Resolver to choose a 'specific' stackage snapshot or a compiler version. +# A snapshot resolver dictates the compiler version and the set of packages +# to be used for project dependencies. For example: +# +# resolver: lts-3.5 +# resolver: nightly-2015-09-21 +# resolver: ghc-7.10.2 +# +# The location of a snapshot can be provided as a file or url. Stack assumes +# a snapshot provided as a file might change, whereas a url resource does not. +# +# resolver: ./custom-snapshot.yaml +# resolver: https://example.com/snapshots/2018-01-01.yaml +resolver: nightly-2022-01-04 + # url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/16/31.yaml + +# User packages to be built. +# Various formats can be used as shown in the example below. +# +# packages: +# - some-directory +# - https://example.com/foo/bar/baz-0.0.2.tar.gz +# subdirs: +# - auto-update +# - wai +packages: +- . +# Dependency packages to be pulled from upstream that are not in the resolver. +# These entries can reference officially published versions as well as +# forks / in-progress versions pinned to a git hash. For example: +# +# extra-deps: +# - acme-missiles-0.3 +# - git: https://github.com/commercialhaskell/stack.git +# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a +# +# extra-deps: [] + +# Override default flag values for local packages and extra-deps +# flags: {} + +# Extra package databases containing global packages +# extra-package-dbs: [] + +# Control whether we use the GHC we find on the path +# system-ghc: true +# +# Require a specific version of stack, using version ranges +# require-stack-version: -any # Default +# require-stack-version: ">=2.7" +# +# Override the architecture used by stack, especially useful on Windows +# arch: i386 +# arch: x86_64 +# +# Extra directories used by stack for building +# extra-include-dirs: [/path/to/dir] +# extra-lib-dirs: [/path/to/dir] +# +# Allow a newer minor version of GHC than the snapshot specifies +# compiler-check: newer-minor From 02f2db6bf3fa0bd54ecd5b817b02b5dc66d0aedb Mon Sep 17 00:00:00 2001 From: Greg Date: Sat, 25 Jun 2022 13:30:54 -0500 Subject: [PATCH 2/2] Use text instead of string --- src/Conllu/DeprelTagset.hs | 108 +++++++++++++++++- src/Conllu/Diff.hs | 24 ++-- src/Conllu/IO.hs | 35 ++++-- src/Conllu/Parse.hs | 181 +++++++++++++++++++++++------- src/Conllu/Print.hs | 218 ++++++++++++++++--------------------- src/Conllu/Test.hs | 10 +- src/Conllu/Type.hs | 147 +++++++++++++++---------- src/Conllu/UposTagset.hs | 38 +++---- src/Conllu/Utils.hs | 27 ++++- 9 files changed, 516 insertions(+), 272 deletions(-) diff --git a/src/Conllu/DeprelTagset.hs b/src/Conllu/DeprelTagset.hs index 6a2b6d8..bcd107f 100644 --- a/src/Conllu/DeprelTagset.hs +++ b/src/Conllu/DeprelTagset.hs @@ -13,8 +13,14 @@ -- import qualified Conllu.DeprelTagset as D -- @ -module Conllu.DeprelTagset where +-- TODO create the UD Taxonomy Table and enhanced dependencies +-- < see https://universaldependencies.org/v2/conll-u.html> +-- TODO include modifier labels, e.g. passive verbs, relative cluses, etc. +-- TODO resolve deriving strategy +module Conllu.DeprelTagset +where +{- original declaration: data EP = REF -- ^ only allowed in DEPS | ACL @@ -54,4 +60,102 @@ data EP | ROOT | VOCATIVE | XCOMP - deriving (Enum, Eq, Read, Show) + deriving ( Show, Eq ) -- , Read,Enum ) It is not clear why Enum and Eq are required +-} + +-- Revised Universal Dependency Relations +-- TODO include subtyped relations and Structural Categories of the dependent +-- Universal Dependencies are represented lowercase +-- see https://universaldependencies.org/u/dep/index.html + +data EP = -- DEPREL = + REF -- ^ only allowed in DEPS + | ACL -- clausal modifier of noun (adnominal clause) +-- | ACL RELCL -- relative clause modifier + | ADVCL -- adverbial clause modifier + | ADVMOD -- adverbial modifier +-- | ADVMOD EMPH -- emphasizing word, intensifier +-- | ADVMOD LMOD -- locative adverbial modifier + | AMOD -- adjectival modifier + | APPOS -- appositional modifier + | AUX -- auxiliary +-- | AUX PASS -- passive auxiliary + | CASE -- case marking + | CC -- coordinating conjunction +-- | CC PRECONJ -- preconjunct + | CCOMP -- clausal complement + | CLF -- classifier + | COMPOUND -- compound +-- | COMPOUND LVC -- light verb construction +-- | COMPOUND PRT -- phrasal verb particle +-- | COMPOUND REDUP -- reduplicated compounds +-- | COMPOUND SVC -- serial verb compounds + | CONJ -- conjunct + | COP -- copula + | CSUBJ -- clausal subject + -- | CSUBJ PASS -- clausal passive subject + | DEP -- unspecified dependency + | DET -- determiner +-- | DET NUMGOV -- pronominal quantifier governing the case of the noun +-- | DET NUMMOD -- pronominal quantifier agreeing in case with the noun +-- | DET POSS -- possessive determiner + | DISCOURSE -- discourse element + | DISLOCATED -- dislocated elements + | EXPL -- expletive +-- | EXPL IMPERS -- impersonal expletive +-- | EXPL PASS -- reflexive pronoun used in reflexive passive +-- | EXPL PV -- reflexive clitic with an inherently reflexive verb + | FIXED -- fixed multiword expression + | FLAT -- flat multiword expression +-- | FLAT FOREIGN -- foreign words +-- | FLAT NAME -- names + | GOESWITH -- goes with + | IOBJ -- indirect object + | LIST -- list + | MARK -- marker + | NMOD -- nominal modifier +-- | NMOD POSS -- possessive nominal modifier +-- | NMOD TMOD -- temporal modifier + | NSUBJ -- nominal subject + -- | NSUBJ PASS -- passive nominal subject + | NUMMOD -- numeric modifier + -- | NUMMOD GOV -- numeric modifier governing the case of the noun + | OBJ -- object + | OBL -- oblique nominal +-- | OBL AGENT -- agent modifier +-- | OBL ARG -- oblique argument +-- | OBL LMOD -- locative modifier +-- | OBL TMOD -- temporal modifier + | ORPHAN -- orphan + | PARATAXIS -- parataxis + | PUNCT -- punctuation + | REPARANDUM -- overridden disfluency + | ROOT -- root + | VOCATIVE -- vocative + | XCOMP -- open clausal complement + deriving ( Show, Eq ) + + + -- Structural Categories of the dependent +{- +type StructuralCats = [Nominals , Clauses , ModifierWords , FunctionWords] +type Nominals = [Nsubj , Obj , Iobj , Obl , Vocative , Expl , Dislocated , Nmod , Appos , Nummod] +type Clauses = [Csubj , Ccomp , Xcomp , Advcl , Acl] +type ModifierWords = [Advmod , Discourse , Amod] +type FunctionWords = [Aux , Cop , Mark , Det , Clf , Case] + +-- Functional Categories in relation to the head +type FunctionalCats = [ Core , NonCore , NominalDependent] +type Core = [Nsubj , Obj , Iobj , Csubj , Ccomp , Xcomp] +type NonCore = [Obl , Vocative , Expl , Dislocated , Advcl , Advmod , Discourse , Aux , Cop , Mark] +type NominalDependent = [Nmod , Appos , Nummod , Acl , Amod , Det , Clf , Case] + +-- Relations that are not dependency relations in the narrow sense. +type Coordination = [ Conj , Cc] +type MWE = [ Fixed , Flat , Compound] -- MultiWordExpressions +type Loose = [ List , Parataxis] +type Special = [ Orphan , Goeswith , Reparandum] +type Other = [ Punct , Root , Dep] +-} + + diff --git a/src/Conllu/Diff.hs b/src/Conllu/Diff.hs index 1b9f699..3f38198 100644 --- a/src/Conllu/Diff.hs +++ b/src/Conllu/Diff.hs @@ -1,4 +1,5 @@ --- | +{-# LANGUAGE OverloadedStrings #-} +-- -- | -- Module : Conllu.Diff -- Copyright : © 2018 bruno cuconato -- License : LPGL-3 @@ -23,10 +24,11 @@ import Conllu.Type import Conllu.Utils import Data.Maybe -import Data.Ord +import Data.Ord (comparing) +import Data.Text (unpack) --- --- * type synonims +-- * type synonyms -- | CoNLL-U field diff. type FDiff = StringPair @@ -45,7 +47,7 @@ diffW = any isJust . printFieldDiffs diffWs :: [CW a] -> [CW a] -> [WDiff a] -- | filters the different word pairs. -diffWs ws1 ws2 = filter diffW $ zip ws1 ws2 +diffWs ws1 ws2 = Prelude.filter diffW $ zip ws1 ws2 diffS :: (Sent, Sent) -> SDiff AW -- | diffs the sentence pair's words. @@ -79,8 +81,10 @@ sentId :: Sent -> Maybe Index -- | try to find an index in a sentence's metadata looking for -- 'sent_id = n'. sentId s = - let mi = lookup "sent_id " $ _meta s - i = fromMaybe "0" mi +-- let myHashMap = fromList $ _meta s +-- mi = myHashMap !? "sent_id" + let mi = lookup "sent_id" $ _meta s + i = unpack $ fromMaybe "0" mi in safeRead i :: Maybe Index pairSents :: [Sent] -> [Sent] -> [(Sent, Sent)] @@ -89,7 +93,7 @@ pairSents = pairSentsBy $ comparing sentId --- -- * printing functions -printFieldDiffs :: WDiff a -> [Maybe StringPair] +printFieldDiffs :: WDiff a -> [Maybe (String, String)] -- | list of maybe differing fields in a pair of words. printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs where @@ -110,14 +114,14 @@ printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs , showM . _misc ] -printWDiff :: WDiff a -> [StringPair] +printWDiff :: WDiff a -> [(String, String)] -- | list of differing fields in a pair of words. printWDiff = catMaybes . printFieldDiffs -printSDiff :: SDiff a -> [[StringPair]] +printSDiff :: SDiff a -> [[(String,String)]] -- | list of differing words in a sentence. printSDiff = fmap printWDiff -printDDiff :: DDiff a -> [[[StringPair]]] +printDDiff :: DDiff a -> [[[(String,String)]]] -- | list of lists of differing words in sentences. printDDiff = fmap printSDiff diff --git a/src/Conllu/IO.hs b/src/Conllu/IO.hs index f0bf780..ac5e642 100644 --- a/src/Conllu/IO.hs +++ b/src/Conllu/IO.hs @@ -9,19 +9,30 @@ -- -- Defines major IO functions. -module Conllu.IO where +module Conllu.IO +( + readAndPrintConllu, + diffConllu, + readDirectory, + readConlluFile +) +where --- -- imports -import Conllu.Type -import Conllu.Utils -import Conllu.Parse -import Conllu.Print -import Conllu.Diff +import Conllu.Type (Doc, Sent) +import Conllu.Utils (if') +import Conllu.Parse (Parser, sentence, parseConlluWith) +import Conllu.Print (printDoc) +import Conllu.Diff (diffSs, printDDiff) -import System.Directory -import System.FilePath +import System.Directory (listDirectory, doesFileExist, doesDirectoryExist) --, getDirectoryContents) +import System.FilePath (()) -- Combine two paths with a path separator +-- import Control.Monad (forM) --GVF added for RWH getRecursiveContents at end +import Data.Functor (($>)) +import qualified Data.Text.IO as TIO +-- import qualified Data.Text as T -- * read functions @@ -32,9 +43,9 @@ import System.FilePath readConlluFileWith :: Parser Sent -> FilePath -> IO Doc -- | reads a file with a customized parser. readConlluFileWith p f = do - ds <- readFile f + ds <- TIO.readFile f case parseConlluWith p f ds of - Left err -> putStr err *> return [] + Left err -> putStr err $> [] Right d -> return d readDirectoryWith :: Parser Sent -> FilePath -> IO [Doc] @@ -70,14 +81,14 @@ readConllu = readConlluWith sentence -- * write writeConlluFile :: FilePath -> Doc -> IO () -- | writes a CoNLL-U file to disk. -writeConlluFile fp = writeFile fp . printDoc +writeConlluFile fp = TIO.writeFile fp . printDoc --- -- * print readAndPrintConllu :: FilePath -> IO () -- | reads and prints the CoNLL-U files given. readAndPrintConllu fp = do - readConlluFile fp >>= putStr . printDoc + readConlluFile fp >>= TIO.putStrLn . printDoc return () --- diff --git a/src/Conllu/Parse.hs b/src/Conllu/Parse.hs index 8bfe576..1e10231 100644 --- a/src/Conllu/Parse.hs +++ b/src/Conllu/Parse.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} -- | -- Module : Conllu.Parse -- Copyright : © 2018 bruno cuconato @@ -59,30 +60,52 @@ import Control.Monad (void, liftM2) import Data.Either import Data.Maybe import Data.Void (Void) - +import Data.Char (isAlpha, isAlphaNum) import qualified Text.Megaparsec as TM import Text.Megaparsec.Char import qualified Text.Megaparsec.Char.Lexer as L +import Data.Text (Text) + --- | Parser type synonym -type Parser = TM.Parsec Void String +-- | Parser Type Synonym +type Parser = TM.Parsec Void Text +-- ^ A concrete Parser where the inner monad is Identity +-- the error component is Void and the input stream is Text -- | Parser raw output type RawData t e = [Either (TM.ParseError t e) Sent] +-- ^ see -- | DEPREL field type synonym -type DEPREL = Maybe (D.EP, Maybe String) +type DEPREL = Maybe (D.EP, Maybe Text) --- -- conllu parsers -rawSents :: Parser (RawData String Void) +rawSents :: Parser (RawData Text Void) -- | parse CoNLL-U sentences with recovery. rawSents = rawSentsC sentence -rawSentsC :: Parser Sent -> Parser (RawData String Void) --- | parse CoNLL-U sentences with recovery, using a custom parser. -rawSentsC sent = TM.between ws TM.eof (TM.endBy1 e lineFeed) +rawSentsC :: Parser Sent -> Parser (RawData Text Void) +{- | Parse CoNLL-U sentences withRecovery, using a custom parser. + A sentence block consists of a set of metadata lines or comments ( 1st character is "#"). + See Conllu.Type Comment for more detail + Two comments are required: + A sentence ID regex: "# sent_id\s*\=*\s*(.)*" ID must be unique and not contain a space. + The backslash is reserved + The sentence regex: "(^# text(_ww) = (.)*$)" , where ww is a two or three code language specifier. + WordLines are the block of annotation lines following the metadata block + WordLines, begin with an Index integer starting at 1 for each new sentence, + a range of integers, e.g. or decimal number. See Conllu.Type data ID for more detail. + CoNLL-U annotations consist of 10 fields separated by a tab (\t). + CoNLL-UP (Plus) may contain more than 10 fields + See + A sentence block ends with one line consisting of a unix lineFeed "\n" and no spaces. + The Sent Parser utilizes the megaparsec "withRecovery primitive" to "build a parsing error handler" + for files that do not follow the CoNLL-U format." And to support verification. +-} +rawSentsC sent = + TM.between ws TM.eof (TM.endBy1 e lineFeed) where e = TM.withRecovery recover (Right <$> sent) recover err = @@ -96,6 +119,7 @@ rawSentsC sent = TM.between ws TM.eof (TM.endBy1 e lineFeed) lineFeed :: Parser () -- | parse a blank line. lineFeed = lexeme . void $ newline -- Spaces shouldn't exist, but no problem being lax here + -- TODO spaces should generate a warning sentence :: Parser Sent -- | the default sentence parser. @@ -139,7 +163,7 @@ wordC ixp fop lp upp xpp fsp drp dsp mp = do mdr <- drp <* tab ds <- dsp <* tab mm <- mp <* lineFeed - return $ mkAW i mf ml mup mxp mfs (rel mdh mdr) ds mm + return $ CW i mf ml mup mxp mfs (rel mdh mdr) ds mm where dhp = maybeEmpty ixp TM. "HEAD" rel :: Maybe ID -> DEPREL -> Maybe Rel @@ -152,6 +176,9 @@ emptyField :: Parser (Maybe a) -- | parse an empty field. emptyField = symbol "_" *> return Nothing TM. "empty field (_)" +-- | 'ID' separator in meta words +type IxSep = Char -- TODO This belongs in the Conllu.Parse Module. Only place it is used. + idW :: Parser ID -- | parse the ID field, which might be an integer, a range, or a -- decimal. @@ -178,7 +205,7 @@ idW = do form :: Parser FORM -- | parse the FORM field. -form = orEmpty stringWSpaces TM. "FORM" +form = orEmpty stringWSpaces TM. "FORM" -- TODO allow blank lemma :: Parser LEMMA -- | parse the LEMMA field. @@ -189,7 +216,26 @@ upos :: Parser UPOS upos = maybeEmpty upos' where upos' :: Parser U.POS - upos' = fmap mkUPOS $ TM.choice $ fmap (string' . show) [U.ADJ .. U.X] + upos' = TM.choice + [ U.ADJ <$ string "ADJ" -- adjective + , U.ADP <$ string "ADP" -- adposition + , U.ADV <$ string "ADV" -- adverb + , U.AUX <$ string "AUX" -- auxiliary + , U.CCONJ <$ string "CCONJ" -- coordinating conjunction + , U.DET <$ string "DET" -- determiner + , U.INTJ <$ string "INTJ" -- interjection + , U.NOUN <$ string "NOUN" -- noun + , U.NUM <$ string "NUM" -- numeral + , U.PART <$ string "PART" -- particle + , U.PRON <$ string "PRON" -- pronoun + , U.PROPN <$ string "PROPN" -- proper noun + , U.PUNCT <$ string "PUNCT" -- punctuation + , U.SCONJ <$ string "SCONJ" -- subordinating conjunction + , U.SYM <$ string "SYM" -- symbol + , U.VERB <$ string "VERB" -- verb + , U.X <$ string "X" -- other + ] +{-# INLINE upos #-} xpos :: Parser XPOS -- | parse the XPOS field. @@ -200,28 +246,69 @@ feats :: Parser FEATS feats = listP (feat `TM.sepBy` symbol "|" TM. "FEATS") where feat = do - k <- lexeme (TM.some alphaNumChar TM. "feature key") + k <- lexeme (TM.takeWhileP (Just "feature key") isAlphaNum) -- feature names: [A-Z][A-Za-z0-9]*(\[[a-z0-9]+\])? ft <- TM.optional $ - TM.between (symbol "[") (symbol "]") (TM.some alphaNumChar) + TM.between (symbol "[") (symbol "]") (TM.takeWhileP (Just "Feature") isAlphaNum) --feature-layer between [] _ <- symbol "=" - vs <- fvalue `TM.sepBy1` symbol "," + vs <- fvalue `TM.sepBy1` symbol "," return $ Feat k vs ft - fvalue = lexeme (TM.some alphaNumChar TM. "feature value") + fvalue = lexeme (TM.takeWhileP (Just "feature value") isAlphaNum) -- featurevalues [A-Z0-9][A-Za-z0-9]* deprel :: Parser DEPREL -- | parse the DEPREL field. deprel = maybeEmpty deprel' dep :: Parser D.EP -dep = fmap mkDEP $ TM.choice $ fmap (string' . show) [D.ACL .. D.XCOMP] - -deprel' :: Parser (D.EP, Maybe String) +dep = TM.choice + [ D.REF <$ string "ref" + , D.ACL <$ string "acl" + , D.ADVCL <$ string "advcl" + , D.ADVMOD <$ string "advmod" + , D.AMOD <$ string "amod" + , D.APPOS <$ string "appos" + , D.AUX <$ string "aux" + , D.CASE <$ string "case" + , D.CCOMP <$ string "ccomp" + , D.CC <$ string "cc" + , D.CLF <$ string "clf" + , D.COMPOUND <$ string "compound" + , D.CONJ <$ string "conj" + , D.COP <$ string "cop" + , D.CSUBJ <$ string "csubj" + , D.DEP <$ string "dep" + , D.DET <$ string "det" + , D.DISCOURSE <$ string "discourse" + , D.DISLOCATED <$ string "dislocated" + , D.EXPL <$ string "expl" + , D.FIXED <$ string "fixed" + , D.FLAT <$ string "flat" + , D.GOESWITH <$ string "goeswith" + , D.IOBJ <$ string "iobj" + , D.LIST <$ string "list" + , D.MARK <$ string "mark" + , D.NMOD <$ string "nmod" + , D.NSUBJ <$ string "nsubj" + , D.NUMMOD <$ string "nummod" + , D.OBJ <$ string "obj" + , D.OBJ <$ string "obj" + , D.OBL <$ string "obl" + , D.ORPHAN <$ string "orphan" + , D.PARATAXIS <$ string "parataxis" + , D.PUNCT <$ string "punct" + , D.REPARANDUM <$ string "reparandum" + , D.ROOT <$ string "root" + , D.VOCATIVE <$ string "vocative" + , D.XCOMP <$ string "xcomp" + ] +{-# INLINE dep #-} + +deprel' :: Parser (D.EP, Maybe Text) -- | parse a non-empty DEPREL field. deprel' = liftM2 (,) dep subdeprel where - subdeprel :: Parser (Maybe String) - subdeprel = TM.optional (symbol ":" *> letters TM. "DEPREL subtype") + subdeprel :: Parser (Maybe Text) + subdeprel = TM.optional (symbol ":" *> TM.takeWhile1P (Just "DEPREL subtype") isAlpha ) deps :: Parser DEPS -- | parse the DEPS field. @@ -246,33 +333,36 @@ misc = orEmpty stringWSpaces TM. "MISC" --- -- utility parsers commentPair :: Parser Comment --- | parse a comment pair. +-- | parse a comment pair.stack build + commentPair = keyValue "=" (stringNot "=\n\t") (TM.option "" stringWSpaces) -listPair :: String -> Parser a -> Parser b -> Parser [(a, b)] +listPair :: Text -> Parser a -> Parser b -> Parser [(a, b)] -- | parse a list of pairs. listPair sep p q = keyValue sep p q `TM.sepBy1` symbol "|" -stringNot :: String -> Parser String +stringNot :: String -> Parser Text -- | parse any chars except the ones provided. stringNot s = lexeme $ TM.takeWhile1P Nothing (`notElem` s) -stringWOSpaces :: Parser String +stringWOSpaces :: Parser Text -- | parse a string until a space, a tab, or a newline. stringWOSpaces = stringNot " \t\n" -stringWSpaces :: Parser String +stringWSpaces :: Parser Text -- | parse a string until a tab or a newline. stringWSpaces = stringNot "\t\n" -letters :: Parser String +{- +letters :: Parser [Char] -- | parse a string of letters. letters = lexeme $ TM.some letterChar +-} --- -- parser combinators -keyValue :: String -> Parser a -> Parser b -> Parser (a, b) +keyValue :: Text -> Parser a -> Parser b -> Parser (a, b) -- | parse a (key, value) pair. keyValue sep p q = do key <- p @@ -298,7 +388,7 @@ maybeEmpty :: Parser a -> Parser (Maybe a) -- 'lemma' would). maybeEmpty p = emptyField TM.<|> fmap Just p -orEmpty :: Parser String -> Parser (Maybe String) +orEmpty :: Parser Text -> Parser (Maybe Text) -- | a parser combinator for parsers that may parse "_". orEmpty p = do r <- p @@ -315,7 +405,7 @@ listP p = fromMaybe [] <$> maybeEmpty p --- -- lexing -symbol :: String -> Parser String +symbol :: Text -> Parser Text symbol = L.symbol ws lexeme :: Parser a -> Parser a @@ -324,9 +414,22 @@ lexeme = L.lexeme ws ws :: Parser () ws = void $ TM.takeWhileP (Just "space") (== ' ') ---- +{- | Conllu Format + Annotations are encoded in plain text files (UTF-8, normalized to NFC, + using only the LF character as line break, including an LF character at the end of file) + with three types of lines: + 1) Comment lines starting with hash (#)Comment lines starting with hash (#) + 2) Blank lines marking sentence boundaries. + 3) Word lines containing the annotation of a word/token in 10 fields + separated by single tab characters; as declared below. + See + + Conllu-Plus formats (.conllup) can be supported by creating + data ConlluPlus = {_conllu :: ParserC , _plus :: PlusParser} + see +-} -- customizable parser -data ParserC = ParserC +data ParserC = ParserC -- | this is the ConlluParser ? { _commentP :: Parser Comment , _idP :: Parser ID , _formP :: Parser FORM @@ -334,11 +437,13 @@ data ParserC = ParserC , _upostagP :: Parser UPOS , _xpostagP :: Parser XPOS , _featsP :: Parser FEATS + , _head :: Text -- | added this slot, need to create a head column parser , _deprelP :: Parser DEPREL , _depsP :: Parser DEPS , _miscP :: Parser MISC } deriving () +{- customC :: ParserC customC = ParserC { _commentP = comment @@ -352,6 +457,7 @@ customC = ParserC , _depsP = deps , _miscP = misc } +-} parserC :: ParserC -> Parser Sent -- | defines a custom parser of sentences. if you only need to @@ -379,12 +485,11 @@ parserC p = --- -- parse parseConlluWith - :: Parser Sent -- ^ the sentence parser to be used. - -> FilePath -- ^ the source whose stream is being supplied in the - -- next argument (may be "" for no file) - -> String -- ^ stream to be parsed - -> Either String Doc --- | parse a CoNLL-U document using a customized parser. + :: Parser Sent -- ^ the sentence parser to be used. + -> FilePath -- ^ the source whose stream is being supplied in the next argument (may be "" for no file) + -> Text -- ^ input for parser + -> Either String Doc -- ^ Either (ParseErrorBundle s e) a +-- | parse a CoNLL-U formatted file using a customized parser. parseConlluWith p fp s = case TM.parse doc fp s of Left err -> Left $ TM.errorBundlePretty err @@ -397,6 +502,6 @@ parseConlluWith p fp s = doc = rawSentsC p -parseConllu :: FilePath -> String -> Either String Doc +parseConllu :: FilePath -> Text -> Either String Doc -- | parse a CoNLL-U document using the default parser. parseConllu = parseConlluWith sentence diff --git a/src/Conllu/Print.hs b/src/Conllu/Print.hs index 2b9ee2b..4fe1423 100644 --- a/src/Conllu/Print.hs +++ b/src/Conllu/Print.hs @@ -9,151 +9,119 @@ -- -- prints CoNLL-U. +{-# LANGUAGE OverloadedStrings #-} module Conllu.Print - ( printDoc - , printSent ) where -import qualified Conllu.DeprelTagset as D import Conllu.Type -import Conllu.Utils - -import Data.List -import Data.Maybe -import Data.Semigroup -import Data.Monoid (Monoid(mempty, mappend)) - --- TODO: use some kind of bi-directional thing to derive this module - --- | Functional list type from LYHGG, see HUGHES, RJM. "A novel --- representation of lists and its application to the function --- 'reverse'" -newtype FList a = FList { getFList :: [a] -> [a] } - -instance Semigroup (FList a) where - (FList f) <> (FList g) = FList (f . g) - -instance Monoid (FList a) where - mempty = FList (\xs -> [] ++ xs) - a `mappend` b = a <> b - -toFList :: [a] -> FList a -toFList xs = FList (xs++) - -fromFList :: FList a -> [a] -fromFList (FList f) = f [] +import Data.List (group, transpose ) --, concat) +import Data.Maybe (fromMaybe, fromJust) --, listToMaybe) +import qualified Data.Text as T --- -- printing -printDoc :: Doc -> String --- | prints a list of sentences to a string. -printDoc = - fromFList . mconcat . map (\s -> printSent' s `mappend` diffLSpace) - -printSent :: Sent -> String --- | prints a sentence to a string. -printSent = fromFList . printSent' - -printSent' :: Sent -> FList Char -printSent' ss = - mconcat - [ printComments (_meta ss) - , diffLSpace - , printWs (_words ss) - ] - -printComments :: [Comment] -> FList Char -printComments = - toFList . - intercalate "\n" . - map - (\(c1, c2) -> - concat - [ "# " - , c1 - , if null c2 - then "" - else "= " ++ c2 - ]) - -printWs :: [CW a] -> FList Char -printWs = foldr (\w dl -> mconcat [printW w, diffLSpace, dl]) mempty - -printW :: CW a -> FList Char -printW = printW' - where - printW' :: CW a -> FList Char - printW' w = - wordLine w - [ printID' - , printFORM - , printLEMMA - , printUPOS' - , printXPOS - , printFEATS' - , printHEAD - , printDEPREL' - , printDEPS' - , printMISC - ] - wordLine :: CW a -> [CW a -> String] -> FList Char - wordLine w = toFList . intercalate "\t" . map (\f -> f w) - printID' = printID . _id - printMStr = fromMaybe "_" - printFORM = printMStr . _form - printLEMMA = printMStr . _lemma - printUPOS' = printUPOS . _upos - printXPOS = printMStr . _xpos - printFEATS' = printFEATS . _feats - printHEAD = maybe "_" (printID . _head) . _rel - printDEPREL' = - maybe "_" (\r -> printDEPREL (_deprel r) (_subdep r)) . _rel - printDEPS' = printDEPS . _deps - printMISC = printMStr . _misc - +printDoc :: Doc -> T.Text +-- | prepares a list of sentences for printing: +printDoc doc = T.intercalate "\n" $ T.intercalate "\t" <$> (concat $ printSent <$> doc) + +printSent :: Sent -> [[T.Text]] +-- | prepares a sentence for printing +-- | transpose regroups the sentence fields into a list of word lines +-- Sentences consist of one or more word lines, and word lines contain the fields in printWords +printSent sentence = group (printComment <$> _meta sentence) + <> transpose (printWords $ _words sentence) + +printComment :: Comment -> T.Text +-- | reconstructs a comment line from its parsed representation +-- "# first comment < = second comment>" +-- TODO GVF fix comment parse -- should not include space +printComment comment = "# " + <> fst comment + <> if T.null $ snd comment + then "" + else " = " + <> snd comment + +printWords :: [CW a] -> [[T.Text]] +-- | reconstructs the 10 fields from parsed representation +printWords cw = (<$> cw) <$> + [ printID . _id + , fromMaybe "_" . _form + , fromMaybe "_" . _lemma + , printUPOS . _upos + , printXPOS . _xpos + , printFEATS . _feats + , printHEAD . _rel + , printDEPREL . _rel +-- , printDEPS . _deps + , printMISC . _misc] --- -- field printers -printID :: ID -> String +printID :: ID -> T.Text printID id' = - case id' of - SID i -> show i - MID s e -> concat [show s, "-", show e] - EID i e -> concat [show i, ".", show e] + case id' of + SID id1 -> T.pack $ show id1 + MID id1 id2 -> T.pack $ show id1 <> "-" <> show id2 + EID id1 id2 -> T.pack $ show id1 <> "." <> show id2 -printUPOS :: UPOS -> String -printUPOS Nothing = "_" -printUPOS (Just pos) = show pos +printUPOS :: UPOS -> T.Text +printUPOS upos = T.pack $ maybe "_" show upos -printFEATS :: FEATS -> String +printXPOS :: XPOS -> T.Text +printXPOS xpos = T.toLower . T.pack $ maybe "_" show xpos + +printFEATS :: FEATS -> T.Text printFEATS = printList printFeat where printFeat Feat {_feat = f, _featValues = vs, _featType = mft} = - let fts = maybe "" (\ft -> "[" ++ ft ++ "]") mft - in concat [f, fts, "=", intercalate "," vs] - -printDEPREL :: D.EP -> Maybe String -> String -printDEPREL dr sdr = - downcaseStr $ show dr ++ maybe "" (":" ++) sdr - -printDEPS :: DEPS -> String -printDEPS = - printList - (\r -> - intercalate - ":" - ([printID (_head r), printDEPREL (_deprel r) (_subdep r)] ++ - fromMaybe [] (_rest r))) + let fts = maybe "" (\ft -> "[" <> ft <> "]") mft + in f <> fts <> "=" <> T.intercalate "," vs + +printHEAD :: Maybe Rel -> T.Text +printHEAD Nothing = "_" +printHEAD (Just rel) = printID (_head rel) + +printDEPREL :: Maybe Rel -> T.Text +printDEPREL rel = deprel <> subdep + where + deprel = T.toLower . T.pack $ maybe "_" (show . _deprel) rel + subdep = maybe "" (maybe "" (":" <>) . _subdep) rel + + +{- +printDEPS :: DEPS -> T.Text -- DEPS = [REL] +printDEPS deps = (\x y z -> T.intercalate ":" [x,y,z]) + <$> printHEAD + <*> printDEPREL + <*> (T.intercalate ":" <$> printREST) + <$> listToMaybe . deps +-} + + +{- + (<$> (listToMaybe <$> deps)) <$> [ + printHEAD, printDEPREL, T.intercalate ":" <$> printREST] + +(\x y z -> T.intercalate ":" [x,y,z]) <$> printHEAD <*> printDEPREL <*> (T.intercalate ":" <$> printREST) <$> (listToMaybe <$> (_deps <$> cw)) + +-} + +printREST :: Maybe Rel -> [T.Text] +printREST = fromMaybe [] <$> _rest . fromJust + +printMISC :: MISC -> T.Text +printMISC = T.pack . maybe "" show --- -- utility printers -printList :: (a -> String) -> [a] -> String -printList f = nullToStr . intercalate "|" . map f +printList :: (a -> T.Text) -> [a] -> T.Text +printList f = nullToStr . T.intercalate "|" . map f where - nullToStr :: String -> String + nullToStr :: T.Text -> T.Text nullToStr xs = - if null xs + if T.null xs then "_" else xs -diffLSpace :: FList Char -diffLSpace = toFList "\n" + + diff --git a/src/Conllu/Test.hs b/src/Conllu/Test.hs index e3b276d..b3ef348 100644 --- a/src/Conllu/Test.hs +++ b/src/Conllu/Test.hs @@ -1,6 +1,6 @@ -module Test where +module Conllu.Test where -import Conllu.Type -import Conllu.IO -import Conllu.Parse -import Conllu.Print +import Conllu.Type() +import Conllu.IO() +import Conllu.Parse() +import Conllu.Print() diff --git a/src/Conllu/Type.hs b/src/Conllu/Type.hs index 30e394d..694d7d2 100644 --- a/src/Conllu/Type.hs +++ b/src/Conllu/Type.hs @@ -18,22 +18,42 @@ module Conllu.Type where import Conllu.Utils import qualified Conllu.UposTagset as U import qualified Conllu.DeprelTagset as D +import Data.Text +import Data.Ord (comparing) -import Data.Ord --- -- * type and data declarations -- ** Documents and Sentences type Doc = [Sent] +type Par = [Sent] + +{- | Sentence Boundaries + There must be exactly one blank line after every sentence, including the last sentence in the file. + Empty sentences are not allowed. + Lines starting with the # character and preceding a sentence are considered as carrying comments or metadata + relevant to the following sentence. + TODO add a command line option to ignore comments and or metadata + The contents of the comments and metadata is basically unrestricted, may be application specific + TODO describe how application specific comment and metadata parsing can be accomplished + Two comments are compulsory + 1) A unique sentence id (# sent_id = ). The actual identifier does not contain whitespace characters + (while the comment line may contain whitespace around the sent_id keyword and the equals-to sign) + In sentence ids, the slash character (“/”) is reserved for specialized downstream use. + 2) The unannotated sentence as a single string (# text = ...), (# translit = ...) for transliterations and + # text_en = .... +-} data Sent = Sent - { _meta :: [Comment] -- ^ the sentence's comments. - , _words :: [CW AW] -- ^ the sentence's words. - } deriving (Eq, Show) + { + _meta :: [Comment] -- ^ the sentence's comments. + , + _words :: [CW AW] -- ^ the sentence's words. + } -- deriving (Show) -- , Eq) --- | most comments are (key, value) pairs. +-- | Comments can be (key, value) pairs. #\s[A-Za-z0-9-_]+\s\=\s(.)*$ type Comment = StringPair -type StringPair = (String, String) +type StringPair = (Text, Text) -- TODO perhaps a Data.Map (Key,Text) -- ** Words -- | represents a word line in a CoNLL-U file. note that we have @@ -41,13 +61,15 @@ type StringPair = (String, String) -- combined as a relation type Rel accessible by the '_rel' function; -- the 'DEPS' field is merely a list of 'Rel'. -- --- a C(oNLL-U)W(ord) may be a simple word, a multi-word token, or an --- empty node. this is captured by the phantom type (the `a` in the --- declaration), which can be parametrized by one of the data types --- below in order to build functions that only operate on one of these --- word types (see 'mkSWord' on how to do this). see the '_dep' --- function, which only operates on simple words, which are the ones --- that have a DEPREL field. +-- a C(oNLL-U)W(ord) may be a simple word, a multi-word token, or an +-- empty node. this is captured by the phantom type (the `a` in the +-- declaration), which can be parametrized by one of the data types +-- below in order to build functions that only operate on one of these +-- word types (see 'mkSWord' on how to do this). see the '_dep' +-- function, which only operates on simple words, which are the ones +-- that have a DEPREL field. +{- TODO In the Parse Module there is an equivalent data type for ParserC + perhaps this can be done as a transformation CW -> Parser CW -} data CW a = CW { _id :: ID -- ^ ID field , _form :: FORM -- ^ FORM field @@ -58,82 +80,90 @@ data CW a = CW , _rel :: Maybe Rel -- ^ combined HEAD and DEPREL fields , _deps :: DEPS -- ^ DEPS field , _misc :: MISC -- ^ MISC field - } deriving (Eq, Show) + } deriving (Show, Eq) + +-- | To ensure token ids follow the order described in the Word, Tokens and Empty Nodes +-- the Ord class is used for totally ordered datatypes +-- Conllu Words, Tokens and Empty Nodes instance Ord (CW a) where compare = comparing _id -- *** Word types -- | phantom type for any kind of word. -data AW +data AW -- anyWord -- | phantom type for a simple word. -data SW +data SW -- simpleWord -- | phantom type for multiword tokens. do note that in MWTs only the -- 'ID', 'FORM' and 'MISC' fields may be non-empty. -data MT --- | phantom type for an empty node. -data EN +data MT --multiWord +-- | phantom type for an empty node. i.e. an inferred word not actually in sentence, +-- i.e. Sue likes coffee and Bill (likes) tea, where (likes) is an empty node. +data EN -- Empty Node -- *** Word Fields -data ID -- | Word ID field. +data ID -- | Word ID field. -- TODO tuple?? (Index,Index) = SID Index -- ^ word ID is an integer | MID Index - Index -- ^ multi-word token ID is a range + Index -- ^ multi-word token ID is a range, which may not overlap TODO [Index ... Index] | EID Index - Index -- ^ empty node ID is a decimal + Index -- ^ empty node ID is a decimal, where 5.9 < 5.10 deriving (Eq, Show) +type Index = Int + + instance Ord ID where compare = idOrd where idOrd :: ID -> ID -> Ordering - idOrd (SID x) (SID y) = compare x y + idOrd (SID x) (SID y) = compare x y -- x and y are Int idOrd id1 id2 = let c = comparing fstIx id1 id2 in case c of EQ -> sameIx id1 id2 _ -> c where - fstIx :: ID -> Index - fstIx (SID i) = i - fstIx (MID i _ei) = i - fstIx (EID i _ei) = i + fstIx :: ID -> Index -- Convert an ID into an Index + fstIx (SID i) = i -- SID is an Int + fstIx (MID i _ei) = i -- MID is a range, e.g. 3-4 + fstIx (EID i _ei) = i -- EID is a decimal > 0 . e.g. 0.2, sndIx :: ID -> Index sndIx (EID _s e) = e sndIx (MID _s e) = e + sndIx (SID i ) = i -- there is no sndIX sameIx :: ID -> ID -> Ordering sameIx (SID _) _id = GT sameIx _id (SID _) = LT -- reverse ID order so that MID 1 4 comes before MID 1 2: sameIx i1 i2 = comparing sndIx i2 i1 -type FORM = Maybe String -type LEMMA = Maybe String -type UPOS = Maybe U.POS -type XPOS = Maybe String -type FEATS = [Feat] -type HEAD = ID -type DEPS = [Rel] -type MISC = Maybe String +type FORM = Maybe Text -- ^ Word form or punctuation +type LEMMA = Maybe Text -- ^ Lemma or Stem Word +type UPOS = Maybe U.POS -- ^ Universal Part of Speech +type XPOS = Maybe Text -- ^ language specific part of speech +type FEATS = [Feat] -- ^ language specific features in data.feats.json +type HEAD = ID -- ^ Head of the current word -- tree structure -- root = 0 catena order +type DEPS = [Rel] -- ^ depRel Universal Dependency Relation or language specific subset +type MISC = Maybe Text -- ^ Any other information -- | feature representation +-- Todo evaluate against the Universal Dependency python Verify.FeatureSet data Feat = Feat - { _feat :: String -- ^ feature name - , _featValues :: [String] -- ^ feature values - , _featType :: Maybe String -- ^ feature type (inside brackets). - } deriving (Eq, Show) -- add manual Ord instance? + { _feat :: Text -- ^ feature name + , _featValues :: [Text] -- ^ feature values + , _featType :: Maybe Text -- ^ feature type (inside brackets). + } deriving (Show, Eq ) -- add manual Ord instance? -- | dependency relation representation. data Rel = Rel - { _head :: HEAD -- ^ head 'ID' - , _deprel :: D.EP -- ^ dependency relation type - , _subdep :: Maybe String -- ^ dependency relation subtype - , _rest :: Maybe [String] -- ^ provisitonal, see issues #23,#17 - } deriving (Eq, Show) + { _head :: HEAD -- ^ head 'ID' + , _deprel :: D.EP -- ^ dependency relation type + , _subdep :: Maybe Text -- ^ dependency relation subtype + , _rest :: Maybe [Text] -- ^ provisitonal, see issues #23,#17 + } deriving (Show, Eq) -- Needed for Data CW definition + -type Index = Int --- | 'ID' separator in meta words -type IxSep = Char --- -- ** accessor functions @@ -143,18 +173,21 @@ _dep w = Just . _deprel =<< _rel w depIs :: D.EP -> CW SW -> Bool -- | check if DEP is the one provided. -depIs d = maybe False (d ==) . _dep +depIs d = (Just d ==) . _dep ---- +--- The following are no longer required: +{- -- ** constructor functions -mkDEP :: String -> D.EP +mkDEP :: Text -> Maybe D.EP -- | read a main DEPREL (no subtype). -mkDEP = read . upcaseStr +mkDEP = readMaybe . toString . toUppoer -- upcaseStr' -mkUPOS :: String -> U.POS +mkUPOS :: Text -> Maybe U.POS -- | read an 'UPOS' tag. -mkUPOS = read . upcaseStr +mkUPOS = readMaybe . toString . toUpper -- upcaseStr' +-} +{-- -- words mkAW :: ID -> FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> Maybe Rel -> DEPS -> MISC -> CW AW @@ -173,12 +206,12 @@ mkSW CW { _id = i , _rel = r , _deps = ds , _misc = m - } = CW i f l u x fs r ds m - + = CW i f l u x fs r ds m +--} {-- saved for a future validation module --- -- validation -mTkOK :: FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> (ID, (Dep, Maybe String)) +mTkOK :: FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> (ID, (Dep, Maybe Text)) -> Bool mTkOK fo l up xp fe h dr d = assSomething fo $ @@ -190,4 +223,4 @@ mTkOK fo l up xp fe h dr d = eTkOK :: Dephead -> DepRel -> Deps -> Bool eTkOK h dr d = assNothing h $ assNothing dr $ (assert . not . null $ d) True ---} +--} \ No newline at end of file diff --git a/src/Conllu/UposTagset.hs b/src/Conllu/UposTagset.hs index 74e1445..699af16 100644 --- a/src/Conllu/UposTagset.hs +++ b/src/Conllu/UposTagset.hs @@ -13,24 +13,24 @@ -- import qualified Conllu.UposTagset as U -- @ -module Conllu.UposTagset where +module Conllu.UposTagset where data POS - = ADJ - | ADP - | ADV - | AUX - | CCONJ - | DET - | INTJ - | NOUN - | NUM - | PART - | PRON - | PROPN - | PUNCT - | SCONJ - | SYM - | VERB - | X - deriving (Enum, Eq, Read, Show) + = ADJ -- adjective + | ADP -- adposition + | ADV -- adverb + | AUX -- auxiliary + | CCONJ -- coordinating conjunction + | DET -- determiner + | INTJ -- interjection + | NOUN -- noun + | NUM -- numeral + | PART -- particle + | PRON -- pronoun + | PROPN -- proper noun + | PUNCT -- punctuation + | SCONJ -- subordinating conjunction + | SYM -- symbol + | VERB -- verb + | X -- other + deriving (Enum, Eq, Read, Show ) diff --git a/src/Conllu/Utils.hs b/src/Conllu/Utils.hs index 94750b1..bbb1feb 100644 --- a/src/Conllu/Utils.hs +++ b/src/Conllu/Utils.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} -- | -- Module : Conllu.Utils -- Copyright : © 2018 bruno cuconato @@ -9,22 +10,39 @@ -- -- the library's utility functions. +-- TODO add export List + module Conllu.Utils where -import Control.Exception.Base -import Data.Char -import Data.Maybe + +-- import Control.Exception.Base +-- import Data.Char +import Data.Maybe +-- import qualified Data.Text as T --- -- char functions +-- TODO (map toUpper doesn’t work for Unicode because the result could have a different length) + +{- upcaseStr :: String -> String upcaseStr = map toUpper downcaseStr :: String -> String downcaseStr = map toLower + +upcaseStr' :: Text -> Text +upcaseStr' = toUpper + +downcaseStr' :: Text -> Text +downcaseStr' = toLower +-} + --- -- assertions + +{- assNothing :: Maybe a -> Bool -> Bool assNothing m = assert (isNothing m) @@ -33,6 +51,7 @@ assSomething m = assert (isJust m) assNull :: [a] -> Bool -> Bool assNull l = assert (null l) +-} --- -- function tools @@ -50,7 +69,7 @@ consIf p a as = --- -- safe functions safehead :: [a] -> Maybe a -safehead [] = Nothing +safehead [] = Nothing safehead (x:_) = Just x safeRead :: Read a => String -> Maybe a