From 25383f48ba98d53d48e49a3d09cb3e62dd5e10d6 Mon Sep 17 00:00:00 2001
From: Greg <gregory.v.funaro@funarotech.com>
Date: Sat, 25 Jun 2022 13:29:34 -0500
Subject: [PATCH 1/2] Created an app directory, renamed hs-conllu.cabal
 hs-conllu.cabal.original, added package.yaml, added stack.yaml

---
 app1/Main.hs                                | 30 +++++++++
 hs-conllu.cabal => hs-conllu.cabal.original |  0
 package.yaml                                | 71 +++++++++++++++++++++
 stack.yaml                                  | 67 +++++++++++++++++++
 4 files changed, 168 insertions(+)
 create mode 100644 app1/Main.hs
 rename hs-conllu.cabal => hs-conllu.cabal.original (100%)
 create mode 100644 package.yaml
 create mode 100644 stack.yaml

diff --git a/app1/Main.hs b/app1/Main.hs
new file mode 100644
index 0000000..cb20ef6
--- /dev/null
+++ b/app1/Main.hs
@@ -0,0 +1,30 @@
+-- |
+-- Module      :  Main
+-- Copyright   :  © 2018 bruno cuconato
+-- License     :  LPGL-3
+--
+-- Maintainer  :  bruno cuconato <bcclaro+hackage@gmail.com>
+-- Stability   :  experimental
+-- Portability :  non-portable
+--
+-- the @hs-conllu@ executable.
+
+module Main
+  ( main )
+where
+
+import Conllu.IO (readAndPrintConllu, diffConllu)
+
+import System.Environment
+
+main :: IO ()
+-- | @validate@ : read CoNLL-U file and print it to stdout. (this will
+-- only apply the command to one file, so use your terminal's
+-- completion mechanism to apply it to several files.
+main = do
+  (c:as) <- getArgs
+  case c of
+    "validate" -> mapM_ readAndPrintConllu as
+    "diff"     -> diffConllu (as !! 0) (as !! 1)
+    _ -> return ()
+  
\ No newline at end of file
diff --git a/hs-conllu.cabal b/hs-conllu.cabal.original
similarity index 100%
rename from hs-conllu.cabal
rename to hs-conllu.cabal.original
diff --git a/package.yaml b/package.yaml
new file mode 100644
index 0000000..50563bd
--- /dev/null
+++ b/package.yaml
@@ -0,0 +1,71 @@
+name:                hs-conllu
+version:             0.1.6
+synopsis:            Conllu validating parser and utils.
+description:         utilities to parse, print, diff, and analyse data in CoNLL-U format.
+homepage:            https://github.com/odanoburu/hs-conllu
+bug-reports:         https://github.com/odanoburu/hs-conllu/issues
+license:             LGPL-3.0-only
+license-file:        LICENSE
+author:              bruno cuconato
+maintainer:          bruno cuconato <bcclaro+haskell+hsconllu@gmail.com>
+copyright:           2021 bruno cuconato
+build-type:          Simple
+extra-doc-files:
+  - README
+  - CHANGELOG
+tested-with:   ghc == 9.2.2
+
+verbatim:
+  cabal-version: 2.0
+
+ghc-options:
+  - -Wall
+  - -Wcompat
+  - -Widentities
+  - -Wincomplete-uni-patterns
+  - -Wincomplete-record-updates
+
+default-extensions:
+  - OverloadedStrings
+  - LambdaCase
+#  - NoImplicitPrelude
+
+dependencies:
+  - aeson
+  - base >= 4.15 && < 5
+  - bytestring
+  - containers
+  - directory
+  - filepath
+  - megaparsec
+  - mtl
+  - optparse-applicative
+  - scientific
+  - string-interpolate
+  - text
+  - text-conversions
+  - text-show
+  - unordered-containers
+  - vector
+
+library:
+  source-dirs:   src
+  other-modules: []
+  exposed-modules:
+  - Conllu.DeprelTagset
+  - Conllu.Diff
+  - Conllu.IO
+  - Conllu.Parse
+  - Conllu.Print
+  - Conllu.Test
+  - Conllu.Type
+  - Conllu.UposTagset
+  - Conllu.Utils
+
+executables:
+  hs-conllu:
+    main:          Main.hs
+    source-dirs:   app1
+    dependencies:
+      - hs-conllu
+
diff --git a/stack.yaml b/stack.yaml
new file mode 100644
index 0000000..e0f50e8
--- /dev/null
+++ b/stack.yaml
@@ -0,0 +1,67 @@
+# This file was automatically generated by 'stack init'
+#
+# Some commonly used options have been documented as comments in this file.
+# For advanced use and comprehensive documentation of the format, please see:
+# https://docs.haskellstack.org/en/stable/yaml_configuration/
+
+# Resolver to choose a 'specific' stackage snapshot or a compiler version.
+# A snapshot resolver dictates the compiler version and the set of packages
+# to be used for project dependencies. For example:
+#
+# resolver: lts-3.5
+# resolver: nightly-2015-09-21
+# resolver: ghc-7.10.2
+#
+# The location of a snapshot can be provided as a file or url. Stack assumes
+# a snapshot provided as a file might change, whereas a url resource does not.
+#
+# resolver: ./custom-snapshot.yaml
+# resolver: https://example.com/snapshots/2018-01-01.yaml
+resolver: nightly-2022-01-04
+      # url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/16/31.yaml
+
+# User packages to be built.
+# Various formats can be used as shown in the example below.
+#
+# packages:
+# - some-directory
+# - https://example.com/foo/bar/baz-0.0.2.tar.gz
+#   subdirs:
+#   - auto-update
+#   - wai
+packages:
+- .
+# Dependency packages to be pulled from upstream that are not in the resolver.
+# These entries can reference officially published versions as well as
+# forks / in-progress versions pinned to a git hash. For example:
+#
+# extra-deps:
+# - acme-missiles-0.3
+# - git: https://github.com/commercialhaskell/stack.git
+#   commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
+#
+# extra-deps: []
+
+# Override default flag values for local packages and extra-deps
+# flags: {}
+
+# Extra package databases containing global packages
+# extra-package-dbs: []
+
+# Control whether we use the GHC we find on the path
+# system-ghc: true
+#
+# Require a specific version of stack, using version ranges
+# require-stack-version: -any # Default
+# require-stack-version: ">=2.7"
+#
+# Override the architecture used by stack, especially useful on Windows
+# arch: i386
+# arch: x86_64
+#
+# Extra directories used by stack for building
+# extra-include-dirs: [/path/to/dir]
+# extra-lib-dirs: [/path/to/dir]
+#
+# Allow a newer minor version of GHC than the snapshot specifies
+# compiler-check: newer-minor

From 02f2db6bf3fa0bd54ecd5b817b02b5dc66d0aedb Mon Sep 17 00:00:00 2001
From: Greg <gregory.v.funaro@funarotech.com>
Date: Sat, 25 Jun 2022 13:30:54 -0500
Subject: [PATCH 2/2] Use text instead of string

---
 src/Conllu/DeprelTagset.hs | 108 +++++++++++++++++-
 src/Conllu/Diff.hs         |  24 ++--
 src/Conllu/IO.hs           |  35 ++++--
 src/Conllu/Parse.hs        | 181 +++++++++++++++++++++++-------
 src/Conllu/Print.hs        | 218 ++++++++++++++++---------------------
 src/Conllu/Test.hs         |  10 +-
 src/Conllu/Type.hs         | 147 +++++++++++++++----------
 src/Conllu/UposTagset.hs   |  38 +++----
 src/Conllu/Utils.hs        |  27 ++++-
 9 files changed, 516 insertions(+), 272 deletions(-)

diff --git a/src/Conllu/DeprelTagset.hs b/src/Conllu/DeprelTagset.hs
index 6a2b6d8..bcd107f 100644
--- a/src/Conllu/DeprelTagset.hs
+++ b/src/Conllu/DeprelTagset.hs
@@ -13,8 +13,14 @@
 -- import qualified Conllu.DeprelTagset as D
 -- @
 
-module Conllu.DeprelTagset where
+-- TODO create the UD Taxonomy Table and enhanced dependencies
+-- < see https://universaldependencies.org/v2/conll-u.html>
+-- TODO include modifier labels, e.g. passive verbs, relative cluses, etc.
+-- TODO resolve deriving strategy
 
+module Conllu.DeprelTagset
+where  
+{-  original declaration:
 data EP
   = REF -- ^ only allowed in DEPS
   | ACL
@@ -54,4 +60,102 @@ data EP
   | ROOT
   | VOCATIVE
   | XCOMP
-  deriving (Enum, Eq, Read, Show)
+  deriving ( Show, Eq )  -- , Read,Enum ) It is not clear why Enum and Eq are required 
+-}
+
+-- Revised Universal Dependency Relations
+-- TODO include subtyped relations and Structural Categories of the dependent
+-- Universal Dependencies are represented lowercase
+-- see https://universaldependencies.org/u/dep/index.html 
+
+data EP = -- DEPREL  =
+        REF        -- ^ only allowed in DEPS
+      |  ACL        -- clausal modifier of noun (adnominal clause)
+--      | ACL RELCL           -- relative clause modifier
+      | ADVCL      -- adverbial clause modifier
+      | ADVMOD     -- adverbial modifier
+--      | ADVMOD EMPH         -- emphasizing word, intensifier
+--      | ADVMOD LMOD         -- locative adverbial modifier
+      | AMOD       -- adjectival modifier
+      | APPOS      -- appositional modifier
+      | AUX        -- auxiliary
+--      | AUX PASS   -- passive auxiliary
+      | CASE       -- case marking
+      | CC         -- coordinating conjunction
+--      | CC PRECONJ -- preconjunct
+      | CCOMP      -- clausal complement
+      | CLF        -- classifier
+      | COMPOUND   -- compound
+--      | COMPOUND LVC        -- light verb construction
+--      | COMPOUND PRT        -- phrasal verb particle
+--      | COMPOUND REDUP      -- reduplicated compounds
+--      | COMPOUND SVC        -- serial verb compounds
+      | CONJ       -- conjunct
+      | COP        -- copula
+      | CSUBJ      -- clausal subject
+ --     | CSUBJ PASS          -- clausal passive subject
+      | DEP        -- unspecified dependency
+      | DET        -- determiner
+--      | DET NUMGOV          -- pronominal quantifier governing the case of the noun
+--      | DET NUMMOD          -- pronominal quantifier agreeing in case with the noun
+--      | DET POSS            -- possessive determiner
+      | DISCOURSE  -- discourse element
+      | DISLOCATED -- dislocated elements
+      | EXPL       -- expletive
+--      | EXPL IMPERS         -- impersonal expletive
+--      | EXPL PASS           -- reflexive pronoun used in reflexive passive
+--      | EXPL PV             -- reflexive clitic with an inherently reflexive verb
+      | FIXED      -- fixed multiword expression
+      | FLAT       -- flat multiword expression
+--      | FLAT FOREIGN        -- foreign words
+--      | FLAT NAME           -- names
+      | GOESWITH   -- goes with
+      | IOBJ       -- indirect object
+      | LIST       -- list
+      | MARK       -- marker
+      | NMOD       -- nominal modifier
+--      | NMOD POSS           -- possessive nominal modifier
+--      | NMOD TMOD           -- temporal modifier
+      | NSUBJ      -- nominal subject
+ --     | NSUBJ PASS          -- passive nominal subject
+      | NUMMOD     -- numeric modifier
+ --     | NUMMOD GOV          -- numeric modifier governing the case of the noun
+      | OBJ        -- object
+      | OBL        -- oblique nominal
+--      | OBL AGENT           -- agent modifier
+--      | OBL ARG             -- oblique argument
+--      | OBL LMOD            -- locative modifier
+--      | OBL TMOD            -- temporal modifier
+      | ORPHAN     -- orphan
+      | PARATAXIS  -- parataxis
+      | PUNCT      -- punctuation
+      | REPARANDUM -- overridden disfluency
+      | ROOT       -- root
+      | VOCATIVE   -- vocative
+      | XCOMP      -- open clausal complement
+      deriving ( Show, Eq )
+      
+      
+  --  Structural Categories of the dependent
+{-
+type StructuralCats = [Nominals , Clauses , ModifierWords , FunctionWords]
+type Nominals       = [Nsubj , Obj , Iobj , Obl , Vocative , Expl , Dislocated , Nmod , Appos , Nummod]
+type Clauses        = [Csubj , Ccomp , Xcomp , Advcl , Acl]
+type ModifierWords  = [Advmod , Discourse , Amod]
+type FunctionWords  = [Aux , Cop , Mark , Det , Clf , Case]
+
+-- Functional Categories in relation to the head
+type FunctionalCats   = [ Core , NonCore , NominalDependent]
+type Core             = [Nsubj , Obj , Iobj , Csubj , Ccomp , Xcomp]
+type NonCore          = [Obl , Vocative , Expl , Dislocated , Advcl , Advmod , Discourse , Aux , Cop , Mark]
+type NominalDependent = [Nmod , Appos , Nummod , Acl , Amod , Det , Clf , Case]
+
+-- Relations that are not dependency relations in the narrow sense.
+type Coordination     = [ Conj , Cc]
+type MWE              = [ Fixed , Flat , Compound] -- MultiWordExpressions
+type Loose            = [ List , Parataxis]
+type Special          = [ Orphan , Goeswith , Reparandum]
+type Other            = [ Punct , Root , Dep]
+-}
+
+
diff --git a/src/Conllu/Diff.hs b/src/Conllu/Diff.hs
index 1b9f699..3f38198 100644
--- a/src/Conllu/Diff.hs
+++ b/src/Conllu/Diff.hs
@@ -1,4 +1,5 @@
--- |
+{-# LANGUAGE OverloadedStrings #-}
+-- -- |
 -- Module      :  Conllu.Diff
 -- Copyright   :  © 2018 bruno cuconato
 -- License     :  LPGL-3
@@ -23,10 +24,11 @@ import Conllu.Type
 import Conllu.Utils
 
 import Data.Maybe
-import Data.Ord
+import Data.Ord (comparing)
+import Data.Text (unpack) 
 
 ---
--- * type synonims
+-- * type synonyms
 -- | CoNLL-U field diff.
 type FDiff = StringPair
 
@@ -45,7 +47,7 @@ diffW = any isJust . printFieldDiffs
 
 diffWs :: [CW a] -> [CW a] -> [WDiff a]
 -- | filters the different word pairs.
-diffWs ws1 ws2 = filter diffW $ zip ws1 ws2
+diffWs ws1 ws2 = Prelude.filter diffW $ zip ws1 ws2
 
 diffS :: (Sent, Sent) -> SDiff AW
 -- | diffs the sentence pair's words.
@@ -79,8 +81,10 @@ sentId :: Sent -> Maybe Index
 -- | try to find an index in a sentence's metadata looking for
 -- 'sent_id = n'.
 sentId s =
-  let mi = lookup "sent_id " $ _meta s
-      i = fromMaybe "0" mi
+--  let myHashMap = fromList $ _meta s
+--      mi = myHashMap !? "sent_id"
+  let mi = lookup "sent_id" $ _meta s
+      i = unpack $ fromMaybe "0" mi
   in safeRead i :: Maybe Index
 
 pairSents :: [Sent] -> [Sent] -> [(Sent, Sent)]
@@ -89,7 +93,7 @@ pairSents = pairSentsBy $ comparing sentId
 
 ---
 -- * printing functions
-printFieldDiffs :: WDiff a -> [Maybe StringPair]
+printFieldDiffs :: WDiff a -> [Maybe (String, String)]
 -- | list of maybe differing fields in a pair of words.
 printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs
   where
@@ -110,14 +114,14 @@ printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs
       , showM . _misc
       ]
 
-printWDiff :: WDiff a -> [StringPair]
+printWDiff :: WDiff a -> [(String, String)]
 -- | list of differing fields in a pair of words.
 printWDiff = catMaybes . printFieldDiffs
 
-printSDiff :: SDiff a -> [[StringPair]]
+printSDiff :: SDiff a -> [[(String,String)]]
 -- | list of differing words in a sentence.
 printSDiff = fmap printWDiff
 
-printDDiff :: DDiff a -> [[[StringPair]]]
+printDDiff :: DDiff a -> [[[(String,String)]]]
 -- | list of lists of differing words in sentences.
 printDDiff = fmap printSDiff
diff --git a/src/Conllu/IO.hs b/src/Conllu/IO.hs
index f0bf780..ac5e642 100644
--- a/src/Conllu/IO.hs
+++ b/src/Conllu/IO.hs
@@ -9,19 +9,30 @@
 --
 -- Defines major IO functions.
 
-module Conllu.IO where
+module Conllu.IO
+(
+  readAndPrintConllu,
+  diffConllu,
+  readDirectory,
+  readConlluFile
+)
+where
 
 ---
 -- imports
-import Conllu.Type
-import Conllu.Utils
-import Conllu.Parse
-import Conllu.Print
-import Conllu.Diff
+import Conllu.Type (Doc, Sent)
+import Conllu.Utils (if')
+import Conllu.Parse (Parser, sentence, parseConlluWith)
+import Conllu.Print (printDoc)
+import Conllu.Diff (diffSs, printDDiff)
 
-import System.Directory
-import System.FilePath
+import System.Directory (listDirectory, doesFileExist, doesDirectoryExist) --, getDirectoryContents)
+import System.FilePath ((</>))  -- Combine two paths with a path separator
 
+-- import Control.Monad (forM) --GVF added for RWH getRecursiveContents at end
+import Data.Functor (($>))
+import qualified Data.Text.IO as TIO
+-- import qualified Data.Text as T
 
 -- * read functions
 
@@ -32,9 +43,9 @@ import System.FilePath
 readConlluFileWith :: Parser Sent -> FilePath -> IO Doc
 -- | reads a file with a customized parser.
 readConlluFileWith p f = do
-  ds <- readFile f
+  ds <- TIO.readFile f
   case parseConlluWith p f ds of
-    Left err -> putStr err *> return []
+    Left err -> putStr err $> []
     Right d -> return d
 
 readDirectoryWith :: Parser Sent -> FilePath -> IO [Doc]
@@ -70,14 +81,14 @@ readConllu = readConlluWith sentence
 -- * write
 writeConlluFile :: FilePath -> Doc -> IO ()
 -- | writes a CoNLL-U file to disk.
-writeConlluFile fp = writeFile fp . printDoc
+writeConlluFile fp = TIO.writeFile fp . printDoc
 
 ---
 -- * print
 readAndPrintConllu :: FilePath -> IO ()
 -- | reads and prints the CoNLL-U files given.
 readAndPrintConllu fp = do
-  readConlluFile fp >>= putStr . printDoc
+  readConlluFile fp >>= TIO.putStrLn . printDoc
   return ()
 
 ---
diff --git a/src/Conllu/Parse.hs b/src/Conllu/Parse.hs
index 8bfe576..1e10231 100644
--- a/src/Conllu/Parse.hs
+++ b/src/Conllu/Parse.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
 -- |
 -- Module      :  Conllu.Parse
 -- Copyright   :  © 2018 bruno cuconato
@@ -59,30 +60,52 @@ import           Control.Monad (void, liftM2)
 import           Data.Either
 import           Data.Maybe
 import           Data.Void (Void)
-
+import           Data.Char (isAlpha, isAlphaNum)    
 import qualified Text.Megaparsec as TM
 import           Text.Megaparsec.Char
 import qualified Text.Megaparsec.Char.Lexer as L
+import           Data.Text (Text)
+
 
--- | Parser type synonym
-type Parser = TM.Parsec Void String
+-- | Parser Type Synonym
+type Parser = TM.Parsec Void Text
+-- ^ A concrete Parser where the inner monad is Identity
+--   the error component is Void and the input stream is Text
 
 -- | Parser raw output
 type RawData t e = [Either (TM.ParseError t e) Sent]
+-- ^ see <https://markkarpov.com/tutorial/megaparsec.html#parse-error-definitions>
 
 -- | DEPREL field type synonym
-type DEPREL = Maybe (D.EP, Maybe String)
+type DEPREL = Maybe (D.EP, Maybe Text)
 
 
 ---
 -- conllu parsers
-rawSents :: Parser (RawData String Void)
+rawSents :: Parser (RawData Text Void)
 -- | parse CoNLL-U sentences with recovery.
 rawSents = rawSentsC sentence
 
-rawSentsC :: Parser Sent -> Parser (RawData String Void)
--- | parse CoNLL-U sentences with recovery, using a custom parser.
-rawSentsC sent = TM.between ws TM.eof (TM.endBy1 e lineFeed)
+rawSentsC :: Parser Sent -> Parser (RawData Text Void)
+{- | Parse CoNLL-U sentences withRecovery, using a custom parser.
+       A sentence block consists of a set of metadata lines or comments ( 1st character is "#").
+          See Conllu.Type Comment for more detail
+          Two comments are required:
+             A sentence ID regex: "# sent_id\s*\=*\s*(.)*" ID must be unique and not contain a space.
+             The backslash is reserved
+             The sentence  regex: "(^# text(_ww) = (.)*$)" , where ww is a two or three code language specifier.
+       WordLines are the block of annotation lines following the metadata block
+       WordLines, begin with an Index integer starting at 1 for each new sentence,
+            a range of integers, e.g. or decimal number. See Conllu.Type data ID for more detail.
+       CoNLL-U annotations consist of 10 fields separated by a tab (\t).
+       CoNLL-UP (Plus) may contain more than 10 fields
+         See <https://universaldependencies.org/ext-format.html>
+       A sentence block ends with one line consisting of a unix lineFeed "\n" and no spaces.
+       The Sent Parser utilizes the megaparsec "withRecovery primitive" to "build a parsing error handler"
+         for files that do not follow the CoNLL-U format."  And to support verification.
+-}
+rawSentsC sent =
+  TM.between ws TM.eof (TM.endBy1 e lineFeed)
   where
     e = TM.withRecovery recover (Right <$> sent)
     recover err =
@@ -96,6 +119,7 @@ rawSentsC sent = TM.between ws TM.eof (TM.endBy1 e lineFeed)
 lineFeed :: Parser ()
 -- | parse a blank line.
 lineFeed = lexeme . void $ newline -- Spaces shouldn't exist, but no problem being lax here
+                                   -- TODO spaces should generate a warning
 
 sentence :: Parser Sent
 -- | the default sentence parser.
@@ -139,7 +163,7 @@ wordC ixp fop lp upp xpp fsp drp dsp mp = do
   mdr <- drp <* tab
   ds  <- dsp <* tab
   mm  <- mp  <* lineFeed
-  return $ mkAW i mf ml mup mxp mfs (rel mdh mdr) ds mm
+  return $ CW i mf ml mup mxp mfs (rel mdh mdr) ds mm
   where
     dhp = maybeEmpty ixp TM.<?> "HEAD"
     rel :: Maybe ID -> DEPREL -> Maybe Rel
@@ -152,6 +176,9 @@ emptyField :: Parser (Maybe a)
 -- | parse an empty field.
 emptyField = symbol "_" *> return Nothing TM.<?> "empty field (_)"
 
+-- | 'ID' separator in meta words
+type IxSep   = Char   -- TODO This belongs in the Conllu.Parse Module. Only place it is used.
+
 idW :: Parser ID
 -- | parse the ID field, which might be an integer, a range, or a
 -- decimal.
@@ -178,7 +205,7 @@ idW = do
 
 form :: Parser FORM
 -- | parse the FORM field.
-form = orEmpty stringWSpaces TM.<?> "FORM"
+form = orEmpty stringWSpaces TM.<?> "FORM"  -- TODO allow blank
 
 lemma :: Parser LEMMA
 -- | parse the LEMMA field.
@@ -189,7 +216,26 @@ upos :: Parser UPOS
 upos = maybeEmpty upos'
   where
     upos' :: Parser U.POS
-    upos' = fmap mkUPOS $ TM.choice $ fmap (string' . show) [U.ADJ .. U.X]
+    upos' = TM.choice
+       [ U.ADJ   <$ string  "ADJ"     --  adjective
+       , U.ADP   <$ string  "ADP"     --  adposition
+       , U.ADV   <$ string  "ADV"     --  adverb
+       , U.AUX   <$ string  "AUX"     --  auxiliary
+       , U.CCONJ <$ string  "CCONJ"   --  coordinating conjunction
+       , U.DET   <$ string  "DET"     --  determiner
+       , U.INTJ  <$ string  "INTJ"    --  interjection
+       , U.NOUN  <$ string  "NOUN"    --  noun
+       , U.NUM   <$ string  "NUM"    --  numeral
+       , U.PART  <$ string  "PART"    --  particle
+       , U.PRON  <$ string  "PRON"    --  pronoun
+       , U.PROPN <$ string  "PROPN"   --  proper noun
+       , U.PUNCT <$ string  "PUNCT"   --  punctuation
+       , U.SCONJ <$ string  "SCONJ"   --  subordinating conjunction
+       , U.SYM   <$ string  "SYM"     --  symbol
+       , U.VERB  <$ string  "VERB"    --  verb
+       , U.X     <$ string  "X"       --  other 
+       ]
+{-# INLINE upos #-}
 
 xpos :: Parser XPOS
 -- | parse the XPOS field.
@@ -200,28 +246,69 @@ feats :: Parser FEATS
 feats = listP (feat `TM.sepBy` symbol "|" TM.<?> "FEATS")
   where
     feat = do
-      k  <- lexeme (TM.some alphaNumChar TM.<?> "feature key")
+      k <- lexeme (TM.takeWhileP (Just "feature key") isAlphaNum) -- feature names: [A-Z][A-Za-z0-9]*(\[[a-z0-9]+\])?
       ft <-
         TM.optional $
-        TM.between (symbol "[") (symbol "]") (TM.some alphaNumChar)
+        TM.between (symbol "[") (symbol "]") (TM.takeWhileP (Just "Feature") isAlphaNum) --feature-layer between []
       _  <- symbol "="
-      vs <- fvalue `TM.sepBy1` symbol ","
+      vs <- fvalue `TM.sepBy1` symbol ","  
       return $ Feat k vs ft
-    fvalue = lexeme (TM.some alphaNumChar TM.<?> "feature value")
+    fvalue = lexeme (TM.takeWhileP (Just "feature value") isAlphaNum)   -- featurevalues [A-Z0-9][A-Za-z0-9]*
 
 deprel :: Parser DEPREL
 -- | parse the DEPREL field.
 deprel = maybeEmpty deprel'
 
 dep :: Parser D.EP
-dep = fmap mkDEP $ TM.choice $ fmap (string' . show) [D.ACL .. D.XCOMP]
-
-deprel' :: Parser (D.EP, Maybe String)
+dep = TM.choice
+  [ D.REF        <$ string  "ref"
+  , D.ACL        <$ string  "acl"
+  , D.ADVCL      <$ string  "advcl"
+  , D.ADVMOD     <$ string  "advmod"
+  , D.AMOD       <$ string  "amod"
+  , D.APPOS      <$ string  "appos"
+  , D.AUX        <$ string  "aux"
+  , D.CASE       <$ string  "case"
+  , D.CCOMP      <$ string  "ccomp"
+  , D.CC         <$ string  "cc"
+  , D.CLF        <$ string  "clf"
+  , D.COMPOUND   <$ string  "compound"
+  , D.CONJ       <$ string  "conj"
+  , D.COP        <$ string  "cop"
+  , D.CSUBJ      <$ string  "csubj"
+  , D.DEP        <$ string  "dep"
+  , D.DET        <$ string  "det"
+  , D.DISCOURSE  <$ string  "discourse"
+  , D.DISLOCATED <$ string  "dislocated"
+  , D.EXPL       <$ string  "expl"
+  , D.FIXED      <$ string  "fixed"
+  , D.FLAT       <$ string  "flat"
+  , D.GOESWITH   <$ string  "goeswith"
+  , D.IOBJ       <$ string  "iobj"
+  , D.LIST       <$ string  "list"
+  , D.MARK       <$ string  "mark"
+  , D.NMOD       <$ string  "nmod"
+  , D.NSUBJ      <$ string  "nsubj"
+  , D.NUMMOD     <$ string  "nummod"
+  , D.OBJ        <$ string  "obj"
+  , D.OBJ        <$ string  "obj"
+  , D.OBL        <$ string  "obl"
+  , D.ORPHAN     <$ string  "orphan"
+  , D.PARATAXIS  <$ string  "parataxis"
+  , D.PUNCT      <$ string  "punct"
+  , D.REPARANDUM <$ string  "reparandum"
+  , D.ROOT       <$ string  "root"
+  , D.VOCATIVE   <$ string  "vocative"
+  , D.XCOMP      <$ string  "xcomp"
+  ]
+{-# INLINE dep #-}
+
+deprel' :: Parser (D.EP, Maybe Text)
 -- | parse a non-empty DEPREL field.
 deprel' = liftM2 (,) dep subdeprel
   where
-    subdeprel :: Parser (Maybe String)
-    subdeprel = TM.optional (symbol ":" *> letters TM.<?> "DEPREL subtype")
+    subdeprel :: Parser (Maybe Text)
+    subdeprel = TM.optional (symbol ":" *> TM.takeWhile1P (Just "DEPREL subtype") isAlpha )
 
 deps :: Parser DEPS
 -- | parse the DEPS field.
@@ -246,33 +333,36 @@ misc = orEmpty stringWSpaces TM.<?> "MISC"
 ---
 -- utility parsers
 commentPair :: Parser Comment
--- | parse a comment pair.
+-- | parse a comment pair.stack build
+
 commentPair =
   keyValue "=" (stringNot "=\n\t") (TM.option "" stringWSpaces)
 
-listPair :: String -> Parser a -> Parser b -> Parser [(a, b)]
+listPair :: Text -> Parser a -> Parser b -> Parser [(a, b)]
 -- | parse a list of pairs.
 listPair sep p q = keyValue sep p q `TM.sepBy1` symbol "|"
 
-stringNot :: String -> Parser String
+stringNot :: String -> Parser Text
 -- | parse any chars except the ones provided.
 stringNot s = lexeme $ TM.takeWhile1P Nothing (`notElem` s)
 
-stringWOSpaces :: Parser String
+stringWOSpaces :: Parser Text
 -- | parse a string until a space, a tab, or a newline.
 stringWOSpaces = stringNot " \t\n"
 
-stringWSpaces :: Parser String
+stringWSpaces :: Parser Text
 -- | parse a string until a tab or a newline.
 stringWSpaces = stringNot "\t\n"
 
-letters :: Parser String
+{-
+letters :: Parser [Char]
 -- | parse a string of letters.
 letters = lexeme $ TM.some letterChar
+-}
 
 ---
 -- parser combinators
-keyValue :: String -> Parser a -> Parser b -> Parser (a, b)
+keyValue :: Text -> Parser a -> Parser b -> Parser (a, b)
 -- | parse a (key, value) pair.
 keyValue sep p q = do
   key   <- p
@@ -298,7 +388,7 @@ maybeEmpty :: Parser a -> Parser (Maybe a)
 -- 'lemma' would).
 maybeEmpty p = emptyField TM.<|> fmap Just p
 
-orEmpty :: Parser String -> Parser (Maybe String)
+orEmpty :: Parser Text -> Parser (Maybe Text)
 -- | a parser combinator for parsers that may parse "_".
 orEmpty p = do
   r <- p
@@ -315,7 +405,7 @@ listP p = fromMaybe [] <$> maybeEmpty p
 
 ---
 -- lexing
-symbol :: String -> Parser String
+symbol :: Text -> Parser Text
 symbol = L.symbol ws
 
 lexeme :: Parser a -> Parser a
@@ -324,9 +414,22 @@ lexeme = L.lexeme ws
 ws :: Parser ()
 ws = void $ TM.takeWhileP (Just "space") (== ' ')
 
----
+{- | Conllu Format
+      Annotations are encoded in plain text files (UTF-8, normalized to NFC,
+      using only the LF character as line break, including an LF character at the end of file)
+      with three types of lines:
+      1) Comment lines starting with hash (#)Comment lines starting with hash (#)
+      2) Blank lines marking sentence boundaries.
+      3) Word lines containing the annotation of a word/token in 10 fields
+         separated by single tab characters; as declared below.
+      See <https://universaldependencies.org/ext-format.html>
+
+    Conllu-Plus formats (.conllup) can be supported by creating
+    data ConlluPlus = {_conllu :: ParserC , _plus :: PlusParser}
+    see <https://universaldependencies.org/ext-format.html>
+-}
 -- customizable parser
-data ParserC = ParserC
+data ParserC = ParserC          -- | this is the ConlluParser ?
   { _commentP :: Parser Comment
   , _idP      :: Parser ID
   , _formP    :: Parser FORM
@@ -334,11 +437,13 @@ data ParserC = ParserC
   , _upostagP :: Parser UPOS
   , _xpostagP :: Parser XPOS
   , _featsP   :: Parser FEATS
+  , _head     :: Text         -- | added this slot, need to create a head column parser
   , _deprelP  :: Parser DEPREL
   , _depsP    :: Parser DEPS
   , _miscP    :: Parser MISC
   } deriving ()
 
+{-
 customC :: ParserC
 customC = ParserC
   { _commentP = comment
@@ -352,6 +457,7 @@ customC = ParserC
   , _depsP    = deps
   , _miscP    = misc
   }
+-}
 
 parserC :: ParserC -> Parser Sent
 -- | defines a custom parser of sentences. if you only need to
@@ -379,12 +485,11 @@ parserC p =
 ---
 -- parse
 parseConlluWith
-  :: Parser Sent -- ^ the sentence parser to be used.
-  -> FilePath    -- ^ the source whose stream is being supplied in the
-                 -- next argument (may be "" for no file)
-  -> String      -- ^ stream to be parsed
-  -> Either String Doc
--- | parse a CoNLL-U document using a customized parser.
+  :: Parser Sent        -- ^ the sentence parser to be used.
+  -> FilePath           -- ^ the source whose stream is being supplied in the  next argument (may be "" for no file)
+  -> Text               -- ^ input for parser
+  -> Either String Doc  -- ^ Either (ParseErrorBundle s e) a
+-- | parse a CoNLL-U formatted file using a customized parser.
 parseConlluWith p fp s =
   case TM.parse doc fp s of
     Left err -> Left $ TM.errorBundlePretty err
@@ -397,6 +502,6 @@ parseConlluWith p fp s =
     doc = rawSentsC p
 
 
-parseConllu :: FilePath -> String -> Either String Doc
+parseConllu :: FilePath -> Text -> Either String Doc
 -- | parse a CoNLL-U document using the default parser.
 parseConllu = parseConlluWith sentence
diff --git a/src/Conllu/Print.hs b/src/Conllu/Print.hs
index 2b9ee2b..4fe1423 100644
--- a/src/Conllu/Print.hs
+++ b/src/Conllu/Print.hs
@@ -9,151 +9,119 @@
 --
 -- prints CoNLL-U.
 
+{-# LANGUAGE OverloadedStrings #-}
 module Conllu.Print
-  ( printDoc
-  , printSent )
 where
 
-import qualified Conllu.DeprelTagset as D
 import           Conllu.Type
-import           Conllu.Utils
-
-import           Data.List
-import           Data.Maybe
-import           Data.Semigroup
-import           Data.Monoid (Monoid(mempty, mappend))
-
--- TODO: use some kind of bi-directional thing to derive this module
-
--- | Functional list type from LYHGG, see HUGHES, RJM. "A novel
--- representation of lists and its application to the function
--- 'reverse'"
-newtype FList a = FList { getFList :: [a] -> [a] }
-
-instance Semigroup (FList a) where
-  (FList f) <> (FList g) = FList (f . g)
-
-instance Monoid (FList a) where
-  mempty = FList (\xs -> [] ++ xs)
-  a `mappend` b = a <> b
-
-toFList :: [a] -> FList a
-toFList xs = FList (xs++)
-
-fromFList :: FList a -> [a]
-fromFList (FList f) = f []
+import           Data.List  (group, transpose ) --, concat)
+import           Data.Maybe (fromMaybe, fromJust) --, listToMaybe)
+import qualified Data.Text      as T
 
 ---
 -- printing
-printDoc :: Doc -> String
--- | prints a list of sentences to a string.
-printDoc =
-  fromFList . mconcat . map (\s -> printSent' s `mappend` diffLSpace)
-
-printSent :: Sent -> String
--- | prints a sentence to a string.
-printSent = fromFList . printSent'
-
-printSent' :: Sent -> FList Char
-printSent' ss =
-  mconcat
-    [ printComments (_meta ss)
-    , diffLSpace
-    , printWs (_words ss)
-    ]
-
-printComments :: [Comment] -> FList Char
-printComments =
-  toFList .
-  intercalate "\n" .
-  map
-    (\(c1, c2) ->
-       concat
-         [ "# "
-         , c1
-         , if null c2
-             then ""
-             else "= " ++ c2
-         ])
-
-printWs :: [CW a] -> FList Char
-printWs = foldr (\w dl -> mconcat [printW w, diffLSpace, dl]) mempty
-
-printW :: CW a -> FList Char
-printW = printW'
-  where
-    printW' :: CW a -> FList Char
-    printW' w =
-      wordLine w
-        [ printID'
-        , printFORM
-        , printLEMMA
-        , printUPOS'
-        , printXPOS
-        , printFEATS'
-        , printHEAD
-        , printDEPREL'
-        , printDEPS'
-        , printMISC
-        ]
-    wordLine :: CW a -> [CW a -> String] -> FList Char
-    wordLine w = toFList . intercalate "\t" . map (\f -> f w)
-    printID' = printID . _id
-    printMStr = fromMaybe "_"
-    printFORM = printMStr . _form
-    printLEMMA = printMStr . _lemma
-    printUPOS' = printUPOS . _upos
-    printXPOS = printMStr . _xpos
-    printFEATS' = printFEATS . _feats
-    printHEAD = maybe "_" (printID . _head) . _rel
-    printDEPREL' =
-      maybe "_" (\r -> printDEPREL (_deprel r) (_subdep r)) . _rel
-    printDEPS' = printDEPS . _deps
-    printMISC = printMStr . _misc
-
+printDoc :: Doc -> T.Text
+-- | prepares a list of sentences for printing:
+printDoc doc =  T.intercalate "\n" $ T.intercalate "\t" <$> (concat $ printSent <$> doc) 
+
+printSent :: Sent -> [[T.Text]]
+-- | prepares a sentence for printing
+-- | transpose regroups the sentence fields into a list of word lines
+--   Sentences consist of one or more word lines, and word lines contain the fields in printWords
+printSent sentence =  group     (printComment <$> _meta  sentence)
+                   <> transpose (printWords    $  _words sentence)
+
+printComment :: Comment -> T.Text
+-- | reconstructs a comment line from its parsed representation
+--   "# first comment < = second comment>"
+-- TODO GVF fix comment parse -- should not include space 
+printComment comment = "# "
+                     <> fst comment
+                     <> if T.null $ snd comment
+                        then ""
+                        else " = "
+                     <> snd comment
+
+printWords :: [CW a]  ->  [[T.Text]]
+-- | reconstructs the 10 fields from parsed representation
+printWords cw =  (<$> cw) <$>
+                [ printID        . _id
+                , fromMaybe "_"  . _form 
+                , fromMaybe "_"  . _lemma 
+                , printUPOS      . _upos  
+                , printXPOS      . _xpos  
+                , printFEATS     . _feats 
+                , printHEAD      . _rel   
+                , printDEPREL    . _rel   
+--                , printDEPS    . _deps  
+                , printMISC      . _misc]
 ---
 -- field printers
-printID :: ID -> String
+printID :: ID -> T.Text
 printID id' =
-  case id' of
-    SID i -> show i
-    MID s e -> concat [show s, "-", show e]
-    EID i e -> concat [show i, ".", show e]
+    case id' of
+      SID id1     -> T.pack $ show id1
+      MID id1 id2 -> T.pack $ show id1 <> "-" <>  show id2
+      EID id1 id2 -> T.pack $ show id1 <> "." <>  show id2
 
-printUPOS :: UPOS -> String
-printUPOS Nothing = "_"
-printUPOS (Just pos) = show pos
+printUPOS :: UPOS -> T.Text
+printUPOS    upos =  T.pack $ maybe "_" show upos
 
-printFEATS :: FEATS -> String
+printXPOS :: XPOS -> T.Text
+printXPOS    xpos =  T.toLower . T.pack $ maybe "_" show xpos
+
+printFEATS :: FEATS -> T.Text
 printFEATS = printList printFeat
   where
     printFeat Feat {_feat = f, _featValues = vs, _featType = mft} =
-      let fts = maybe "" (\ft -> "[" ++ ft ++ "]") mft
-      in concat [f, fts, "=", intercalate "," vs]
-
-printDEPREL :: D.EP -> Maybe String -> String
-printDEPREL dr sdr =
-  downcaseStr $ show dr ++ maybe "" (":" ++) sdr
-
-printDEPS :: DEPS -> String
-printDEPS =
-  printList
-    (\r ->
-       intercalate
-         ":"
-         ([printID (_head r), printDEPREL (_deprel r) (_subdep r)] ++
-          fromMaybe [] (_rest r)))
+      let fts = maybe "" (\ft -> "[" <> ft <> "]") mft
+      in f <> fts <> "=" <> T.intercalate "," vs
+
+printHEAD :: Maybe Rel -> T.Text
+printHEAD    Nothing   =  "_"
+printHEAD   (Just rel) = printID (_head rel)
+
+printDEPREL :: Maybe Rel -> T.Text
+printDEPREL   rel = deprel <> subdep
+  where
+    deprel = T.toLower . T.pack $ maybe "_" (show . _deprel) rel
+    subdep =  maybe "" (maybe "" (":" <>) . _subdep) rel
+
+
+{-
+printDEPS :: DEPS -> T.Text -- DEPS = [REL]
+printDEPS deps = (\x y z -> T.intercalate ":" [x,y,z]) 
+              <$> printHEAD 
+              <*> printDEPREL 
+              <*> (T.intercalate ":" <$> printREST) 
+              <$> listToMaybe . deps
+-}
+
+
+{-
+ (<$> (listToMaybe <$> deps)) <$> [
+  printHEAD, printDEPREL, T.intercalate ":" <$> printREST]
+  
+(\x y z -> T.intercalate ":" [x,y,z]) <$> printHEAD <*> printDEPREL <*> (T.intercalate ":" <$> printREST) <$> (listToMaybe <$> (_deps <$> cw))
+ 
+-}
+
+printREST :: Maybe Rel -> [T.Text]
+printREST = fromMaybe [] <$> _rest . fromJust
+
+printMISC :: MISC -> T.Text
+printMISC = T.pack . maybe "" show
 
 ---
 -- utility printers
-printList :: (a -> String) -> [a] -> String
-printList f = nullToStr . intercalate "|" . map f
+printList :: (a -> T.Text) -> [a] -> T.Text
+printList f = nullToStr . T.intercalate "|" . map f
   where
-    nullToStr :: String -> String
+    nullToStr :: T.Text -> T.Text
     nullToStr xs =
-      if null xs
+      if T.null xs
         then "_"
         else xs
 
-diffLSpace :: FList Char
-diffLSpace = toFList "\n"
+
+
diff --git a/src/Conllu/Test.hs b/src/Conllu/Test.hs
index e3b276d..b3ef348 100644
--- a/src/Conllu/Test.hs
+++ b/src/Conllu/Test.hs
@@ -1,6 +1,6 @@
-module Test where
+module Conllu.Test where
 
-import Conllu.Type
-import Conllu.IO
-import Conllu.Parse
-import Conllu.Print
+import Conllu.Type()
+import Conllu.IO()
+import Conllu.Parse()
+import Conllu.Print()
diff --git a/src/Conllu/Type.hs b/src/Conllu/Type.hs
index 30e394d..694d7d2 100644
--- a/src/Conllu/Type.hs
+++ b/src/Conllu/Type.hs
@@ -18,22 +18,42 @@ module Conllu.Type where
 import           Conllu.Utils
 import qualified Conllu.UposTagset as U
 import qualified Conllu.DeprelTagset as D
+import           Data.Text
+import           Data.Ord (comparing)
 
-import           Data.Ord
 
 ---
 -- * type and data declarations
 -- ** Documents and Sentences
 type Doc = [Sent]
+type Par = [Sent]
+
+{- | Sentence Boundaries
+      There must be exactly one blank line after every sentence, including the last sentence in the file.
+      Empty sentences are not allowed.
+      Lines starting with the # character and preceding a sentence are considered as carrying comments or metadata
+      relevant to the following sentence.
+            TODO add a command line option to ignore comments and or metadata
+      The contents of the comments and metadata is basically unrestricted, may be application specific
+            TODO describe how application specific comment and metadata parsing can be accomplished
+      Two comments are compulsory
+      1) A unique sentence id (# sent_id = ). The actual identifier does not contain whitespace characters
+         (while the comment line may contain whitespace around the sent_id keyword and the equals-to sign)
+         In sentence ids, the slash character (“/”) is reserved for specialized downstream use.
+      2) The unannotated sentence as a single string (# text = ...), (# translit = ...) for transliterations and
+         # text_en = ....
+-}
 
 data Sent = Sent
-  { _meta  :: [Comment]  -- ^ the sentence's comments.
-  , _words :: [CW AW] -- ^ the sentence's words.
-  } deriving (Eq, Show)
+  {
+   _meta  :: [Comment]  -- ^ the sentence's comments.
+  ,
+   _words :: [CW AW]    -- ^ the sentence's words.
+  } -- deriving (Show) -- , Eq)
 
--- | most comments are (key, value) pairs.
+-- | Comments can be (key, value) pairs. #\s[A-Za-z0-9-_]+\s\=\s(.)*$
 type Comment    = StringPair
-type StringPair = (String, String)
+type StringPair = (Text, Text)  -- TODO perhaps a Data.Map (Key,Text)
 
 -- ** Words
 -- | represents a word line in a CoNLL-U file. note that we have
@@ -41,13 +61,15 @@ type StringPair = (String, String)
 -- combined as a relation type Rel accessible by the '_rel' function;
 -- the 'DEPS' field is merely a list of 'Rel'.
 --
--- a C(oNLL-U)W(ord) may be a simple word, a multi-word token, or an
--- empty node. this is captured by the phantom type (the `a` in the
--- declaration), which can be parametrized by one of the data types
--- below in order to build functions that only operate on one of these
--- word types (see 'mkSWord' on how to do this). see the '_dep'
--- function, which only operates on simple words, which are the ones
--- that have a DEPREL field.
+--   a C(oNLL-U)W(ord) may be a simple word, a multi-word token, or an
+--   empty node. this is captured by the phantom type (the `a` in the
+--   declaration), which can be parametrized by one of the data types
+--   below in order to build functions that only operate on one of these
+--   word types (see 'mkSWord' on how to do this). see the '_dep'
+--   function, which only operates on simple words, which are the ones
+--   that have a DEPREL field.
+{-  TODO In the Parse Module there is an equivalent data type for ParserC
+       perhaps this can be done as a transformation CW -> Parser CW -}
 data CW a = CW
   { _id    :: ID        -- ^ ID field
   , _form  :: FORM      -- ^ FORM field
@@ -58,82 +80,90 @@ data CW a = CW
   , _rel   :: Maybe Rel -- ^ combined HEAD and DEPREL fields
   , _deps  :: DEPS      -- ^ DEPS field
   , _misc  :: MISC      -- ^ MISC field
-  } deriving (Eq, Show)
+  } deriving (Show, Eq)
+
+-- | To ensure token ids follow the order described in the Word, Tokens and Empty Nodes
+--   the Ord class is used for totally ordered datatypes
+--   Conllu Words, Tokens and Empty Nodes <https://universaldependencies.org/format.html#words-tokens-and-empty-nodes>
 
 instance Ord (CW a) where
   compare = comparing _id
 
 -- *** Word types
 -- | phantom type for any kind of word.
-data AW
+data AW -- anyWord
 -- | phantom type for a simple word.
-data SW
+data SW -- simpleWord
 -- | phantom type for multiword tokens. do note that in MWTs only the
 -- 'ID', 'FORM' and 'MISC' fields may be non-empty.
-data MT
--- | phantom type for an empty node.
-data EN
+data MT --multiWord
+-- | phantom type for an empty node. i.e. an inferred word not actually in sentence,
+--   i.e. Sue likes coffee and Bill (likes) tea, where (likes) is an empty node.
+data EN -- Empty Node
 
 -- *** Word Fields
-data ID -- | Word ID field.
+data ID -- | Word ID field.  -- TODO tuple?? (Index,Index)
   = SID Index -- ^ word ID is an integer
   | MID Index
-        Index -- ^ multi-word token ID is a range
+        Index -- ^ multi-word token ID is a range, which may not overlap TODO [Index ... Index]
   | EID Index
-        Index -- ^ empty node ID is a decimal
+        Index -- ^ empty node ID is a decimal, where 5.9 < 5.10
   deriving (Eq, Show)
 
+type Index   = Int
+
+
 instance Ord ID where
   compare = idOrd
     where
       idOrd :: ID -> ID -> Ordering
-      idOrd (SID x) (SID y) = compare x y
+      idOrd (SID x) (SID y) = compare x y  -- x and y are Int
       idOrd id1 id2 =
         let c = comparing fstIx id1 id2
         in case c of
           EQ -> sameIx id1 id2
           _ -> c
         where
-          fstIx :: ID -> Index
-          fstIx (SID i) = i
-          fstIx (MID i _ei) = i
-          fstIx (EID i _ei) = i
+          fstIx :: ID -> Index  -- Convert an ID into an Index
+          fstIx (SID i) = i     -- SID is an Int
+          fstIx (MID i _ei) = i -- MID is a range, e.g. 3-4
+          fstIx (EID i _ei) = i -- EID is a decimal > 0 . e.g. 0.2,
           sndIx :: ID -> Index
           sndIx (EID _s e) = e
           sndIx (MID _s e) = e
+          sndIx (SID i ) = i -- there is no sndIX
           sameIx :: ID -> ID -> Ordering
           sameIx (SID _) _id = GT
           sameIx _id (SID _) = LT
           -- reverse ID order so that MID 1 4 comes before MID 1 2:
           sameIx i1 i2 = comparing sndIx i2 i1
 
-type FORM  = Maybe String
-type LEMMA = Maybe String
-type UPOS  = Maybe U.POS
-type XPOS  = Maybe String
-type FEATS = [Feat]
-type HEAD  = ID
-type DEPS  = [Rel]
-type MISC  = Maybe String
+type FORM  = Maybe Text  -- ^ Word form or punctuation
+type LEMMA = Maybe Text  -- ^ Lemma or Stem Word
+type UPOS  = Maybe U.POS -- ^ Universal Part of Speech
+type XPOS  = Maybe Text  -- ^ language specific part of speech
+type FEATS = [Feat]      -- ^ language specific features in data.feats.json
+type HEAD  = ID          -- ^ Head of the current word -- tree structure -- root = 0 catena order
+type DEPS  = [Rel]       -- ^ depRel Universal Dependency Relation or language specific subset
+type MISC  = Maybe Text  -- ^ Any other information
 
 -- | feature representation
+--   Todo evaluate against the Universal Dependency python Verify.FeatureSet
 data Feat = Feat
-  { _feat       :: String       -- ^ feature name
-  , _featValues :: [String]     -- ^ feature values
-  , _featType   :: Maybe String -- ^ feature type (inside brackets).
-  } deriving (Eq, Show) -- add manual Ord instance?
+  { _feat       :: Text       -- ^ feature name
+  , _featValues :: [Text]     -- ^ feature values
+  , _featType   :: Maybe Text -- ^ feature type (inside brackets).
+  } deriving (Show, Eq )   -- add manual Ord instance?
 
 -- | dependency relation representation.
 data Rel = Rel
-  { _head :: HEAD -- ^ head 'ID'
-  , _deprel :: D.EP -- ^ dependency relation type
-  , _subdep :: Maybe String -- ^ dependency relation subtype
-  , _rest :: Maybe [String] -- ^ provisitonal, see issues #23,#17
-  } deriving (Eq, Show)
+  { _head :: HEAD         -- ^ head 'ID'
+  , _deprel :: D.EP       -- ^ dependency relation type
+  , _subdep :: Maybe Text -- ^ dependency relation subtype
+  , _rest :: Maybe [Text] -- ^ provisitonal, see issues #23,#17
+  }  deriving (Show, Eq)  -- Needed for Data CW definition
+
 
-type Index   = Int
--- | 'ID' separator in meta words
-type IxSep   = Char
 
 ---
 -- ** accessor functions
@@ -143,18 +173,21 @@ _dep w = Just . _deprel =<< _rel w
 
 depIs :: D.EP -> CW SW -> Bool
 -- | check if DEP is the one provided.
-depIs d = maybe False (d ==) . _dep
+depIs d = (Just d ==) . _dep
 
----
+--- The following are no longer required:
+{-
 -- ** constructor functions
-mkDEP :: String -> D.EP
+mkDEP ::  Text -> Maybe D.EP
 -- | read a main DEPREL (no subtype).
-mkDEP = read . upcaseStr
+mkDEP = readMaybe . toString . toUppoer -- upcaseStr'
 
-mkUPOS :: String -> U.POS
+mkUPOS :: Text -> Maybe U.POS
 -- | read an 'UPOS' tag.
-mkUPOS = read . upcaseStr
+mkUPOS =  readMaybe . toString . toUpper -- upcaseStr'
+-}
 
+{--
 -- words
 mkAW :: ID -> FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> Maybe Rel
   -> DEPS -> MISC -> CW AW
@@ -173,12 +206,12 @@ mkSW CW { _id = i
         , _rel = r
         , _deps = ds
         , _misc = m
-        } = CW i f l u x fs r ds m
-
+ = CW i f l u x fs r ds m
+--}
 {-- saved for a future validation module
 ---
 -- validation
-mTkOK :: FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> (ID, (Dep, Maybe String))
+mTkOK :: FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> (ID, (Dep, Maybe Text))
   -> Bool
 mTkOK fo l up xp fe h dr d =
   assSomething fo $
@@ -190,4 +223,4 @@ mTkOK fo l up xp fe h dr d =
 eTkOK :: Dephead -> DepRel -> Deps -> Bool
 eTkOK h dr d =
   assNothing h $ assNothing dr $ (assert . not . null $ d) True
---}
+--}
\ No newline at end of file
diff --git a/src/Conllu/UposTagset.hs b/src/Conllu/UposTagset.hs
index 74e1445..699af16 100644
--- a/src/Conllu/UposTagset.hs
+++ b/src/Conllu/UposTagset.hs
@@ -13,24 +13,24 @@
 -- import qualified Conllu.UposTagset as U
 -- @
 
-module Conllu.UposTagset where
+module Conllu.UposTagset where  
 
 data POS
-  = ADJ
-  | ADP
-  | ADV
-  | AUX
-  | CCONJ
-  | DET
-  | INTJ
-  | NOUN
-  | NUM
-  | PART
-  | PRON
-  | PROPN
-  | PUNCT
-  | SCONJ
-  | SYM
-  | VERB
-  | X
-  deriving (Enum, Eq, Read, Show)
+  = ADJ   --  adjective
+  | ADP   --  adposition
+  | ADV   --  adverb
+  | AUX   --  auxiliary
+  | CCONJ --  coordinating conjunction
+  | DET   --  determiner
+  | INTJ  --  interjection
+  | NOUN  --  noun
+  | NUM   --  numeral
+  | PART  --  particle
+  | PRON  --  pronoun
+  | PROPN --  proper noun
+  | PUNCT --  punctuation
+  | SCONJ --  subordinating conjunction
+  | SYM   --  symbol
+  | VERB  --  verb
+  | X     --  other
+ deriving (Enum, Eq, Read, Show ) 
diff --git a/src/Conllu/Utils.hs b/src/Conllu/Utils.hs
index 94750b1..bbb1feb 100644
--- a/src/Conllu/Utils.hs
+++ b/src/Conllu/Utils.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
 -- |
 -- Module      :  Conllu.Utils
 -- Copyright   :  © 2018 bruno cuconato
@@ -9,22 +10,39 @@
 --
 -- the library's utility functions.
 
+-- TODO add export List
+
 module Conllu.Utils where
 
-import Control.Exception.Base
-import Data.Char
-import Data.Maybe
+
+-- import           Control.Exception.Base
+-- import           Data.Char
+import           Data.Maybe
+-- import qualified Data.Text              as T
 
 ---
 -- char functions
+-- TODO (map toUpper doesn’t work for Unicode because the result could have a different length)
+
+{-
 upcaseStr :: String -> String
 upcaseStr = map toUpper
 
 downcaseStr :: String -> String
 downcaseStr = map toLower
 
+
+upcaseStr' :: Text -> Text
+upcaseStr' =  toUpper
+
+downcaseStr' :: Text -> Text
+downcaseStr' =  toLower
+-}
+
 ---
 -- assertions
+
+{-
 assNothing :: Maybe a -> Bool -> Bool
 assNothing m = assert (isNothing m)
 
@@ -33,6 +51,7 @@ assSomething m = assert (isJust m)
 
 assNull :: [a] -> Bool -> Bool
 assNull l = assert (null l)
+-}
 
 ---
 -- function tools
@@ -50,7 +69,7 @@ consIf p a as =
 ---
 -- safe functions
 safehead :: [a] -> Maybe a
-safehead [] = Nothing
+safehead []    = Nothing
 safehead (x:_) = Just x
 
 safeRead :: Read a => String -> Maybe a