diff --git a/Hs-conllu.cabal b/Hs-conllu.cabal index 85f2c0f..12a28fd 100644 --- a/Hs-conllu.cabal +++ b/Hs-conllu.cabal @@ -17,6 +17,6 @@ library exposed-modules: Conllu.Type, Conllu.Data.Tree, Conllu.Data.Token, Conllu.IO, Conllu.Parse, Conllu.Diff, Conllu.Utils --other-modules: Conllu.Utils -- other-extensions: - build-depends: base >=4.9 && <5, containers >0.5.8 && <0.6, directory >=1.3 && <1.4, filepath >=1.4 && <1.5, parsec >=3.1 && <3.2 + build-depends: base >=4.9 && <5, containers >0.5.8 && <0.6, directory >=1.3 && <1.4, filepath >=1.4 && <1.5, megaparsec >=6 && <7, void <1, text >=1 && <2 hs-source-dirs: src default-language: Haskell2010 diff --git a/src/Conllu/IO.hs b/src/Conllu/IO.hs index a80e00d..5ff6f4b 100644 --- a/src/Conllu/IO.hs +++ b/src/Conllu/IO.hs @@ -10,17 +10,17 @@ import System.Directory import System.Environment import System.FilePath import System.IO - -import Text.Parsec.String +import qualified Text.Megaparsec as M --- -- uses customized parser readConlluFileWith :: Parser [Sentence] -> FilePath -> IO Document readConlluFileWith p f = do - r <- parseFromFile p f + d <- readFile f + let r = M.parse p f d case r of Left err -> do - print err + putStr . M.parseErrorPretty $ err return $ Document f [] Right ss -> return $ Document (takeFileName f) ss diff --git a/src/Conllu/Parse.hs b/src/Conllu/Parse.hs index ac9008f..ff0b273 100644 --- a/src/Conllu/Parse.hs +++ b/src/Conllu/Parse.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} module Conllu.Parse where {-- @@ -21,11 +22,13 @@ import Data.List import Data.Maybe import System.Environment import System.IO +import Data.Void +import qualified Data.Text as T -import Text.Parsec hiding (token) -import Text.Parsec.Combinator -import Text.Parsec.String -import Text.ParserCombinators.Parsec.Char +import qualified Text.Megaparsec as M +import Text.Megaparsec.Char + +type Parser = M.Parsec Void Stream --- -- conllu parsers @@ -33,7 +36,7 @@ document :: Parser [Sentence] document = documentC sentence documentC :: Parser Sentence -> Parser [Sentence] -documentC s = endBy1 s blankLine <* eof +documentC s = M.endBy1 s blankLine <* M.eof blankLine :: Parser () blankLine = litSpaces <* newline -- spaces shouldn't exist, but no @@ -43,7 +46,7 @@ sentence :: Parser Sentence sentence = sentenceC comment token sentenceC :: Parser Comment -> Parser Token -> Parser Sentence -sentenceC c t = liftM2 Sentence (many c) (many1 t) +sentenceC c t = liftM2 Sentence (M.many c) (M.some t) comment :: Parser Comment comment = do char '#' @@ -59,8 +62,8 @@ tokenC :: Parser Index -> Parser IxSep -> Parser Form -> Parser Lemma -> Parser Token tokenC ix is fo l up xp fe dh dr ds m = mkToken <$> ix - <*> optionMaybe is - <*> optionMaybe ix <* tab + <*> M.optional is + <*> M.optional ix <* tab <*> fo <* tab <*> l <* tab <*> up <* tab @@ -76,11 +79,11 @@ emptyField = do char '_' return Nothing index :: Parser Index -index = do ix <- many1 digit +index = do ix <- M.some digitChar return (read ix :: Index) indexSep :: Parser IxSep -indexSep = choice [char '-', char '.'] +indexSep = char '-' M.<|> char '.' form :: Parser Form form = maybeEmpty stringWSpaces @@ -108,11 +111,11 @@ deprel = maybeEmpty deprel' deprel' :: Parser (Dep, Subtype) deprel' = do dep <- dep - st <- option [] $ char ':' *> many1 letter + st <- M.option [] $ char ':' *> M.some letterChar return (dep,st) where dep :: Parser Dep - dep = liftM mkDep $ many1 letter + dep = liftM mkDep $ M.some letterChar deps :: Parser Deps deps = listP $ listPair ':' index deprel' @@ -124,30 +127,30 @@ misc = maybeEmpty stringWSpaces -- utility parsers litSpaces :: Parser () -- because spaces consumes \t and \n -litSpaces = skipMany $ char ' ' +litSpaces = M.skipMany $ char ' ' commentPair :: Parser Comment commentPair = - keyValue '=' (stringNot "=\n\t") (option [] stringWSpaces) + keyValue '=' (stringNot "=\n\t") (M.option [] stringWSpaces) listPair :: Char -> Parser a -> Parser b -> Parser [(a, b)] -listPair sep p q = sepBy1 (keyValue sep p q) (char '|') +listPair sep p q = M.sepBy1 (keyValue sep p q) (char '|') -stringNot :: String -> Parser String +stringNot :: Stream -> Parser Stream -- [ ] second litSpaces in symbol is redundant -stringNot s = symbol . many1 $ noneOf s +stringNot s = symbol . M.some $ satisfy (\c -> not $ c `elem` s) -stringWSpaces :: Parser String +stringWSpaces :: Parser Stream stringWSpaces = stringNot "\t\n" -stringWOSpaces :: Parser String +stringWOSpaces :: Parser Stream stringWOSpaces = stringNot " \t\n" --- -- parser combinators keyValue :: Char -> Parser a -> Parser b -> Parser (a, b) keyValue sep p q = do key <- p - optional $ char sep + M.optional $ char sep value <- q return (key, value) @@ -155,7 +158,7 @@ symbol :: Parser a -> Parser a symbol p = litSpaces *> p <* litSpaces maybeEmpty :: Parser a -> Parser (Maybe a) -maybeEmpty p = emptyField <|> liftM Just p +maybeEmpty p = emptyField M.<|> liftM Just p listP :: Parser [a] -> Parser [a] -- using a parser that returns a possibly empty list like sepBy and diff --git a/src/Conllu/Type.hs b/src/Conllu/Type.hs index 75620ba..a4df8a3 100644 --- a/src/Conllu/Type.hs +++ b/src/Conllu/Type.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} module Conllu.Type where --- @@ -10,11 +11,14 @@ import Data.List import Data.Maybe import Data.Ord import Data.Tree +import qualified Data.Text as T --- -- type and data declarations +type Stream = T.Text + data Document = Document - { _file :: String + { _file :: Stream , _sents :: [Sentence] } deriving (Eq,Show) @@ -23,8 +27,8 @@ data Sentence = Sentence , _tokens :: [Token] } deriving (Eq, Show) -type Comment = StringPair -type StringPair = (String, String) +type Comment = StreamPair +type StreamPair = (Stream, Stream) data Token = SToken { _ix :: Index @@ -59,16 +63,16 @@ data Token type Index = Int type IxSep = Char -type Form = Maybe String -type Lemma = Maybe String +type Form = Maybe T.Text +type Lemma = Maybe T.Text type PosTag = Maybe Pos -type Xpostag = Maybe String -type Feats = [StringPair] +type Xpostag = Maybe T.Text +type Feats = [StreamPair] type Dephead = Maybe Index type DepRel = Maybe (Dep,Subtype) -type Subtype = String +type Subtype = T.Text type Deps = [(Index,(Dep,Subtype))] -type Misc = Maybe String +type Misc = Maybe T.Text _dep :: Token -> Maybe Dep _dep = dep . _deprel @@ -147,8 +151,8 @@ type ETree = (TTree, [Token]) -- enhanced tree --- -- constructor functions -mkDep :: String -> Dep -mkDep = mkDep' . upcaseString +mkDep :: Stream -> Dep +mkDep = mkDep' . upcaseStream where mkDep' "ACL" = ACL mkDep' "ADVCL" = ADVCL @@ -188,8 +192,8 @@ mkDep = mkDep' . upcaseString mkDep' "VOCATIVE" = VOCATIVE mkDep' "XCOMP" = XCOMP -mkPos :: String -> Pos -mkPos = mkPos' . upcaseString +mkPos :: Stream -> Pos +mkPos = mkPos' . upcaseStream where mkPos' "ADJ" = ADJ mkPos' "ADP" = ADP @@ -270,8 +274,8 @@ eTokenOK h dr d = assNothing h $ assNothing dr $ assSomething d True --- -- utility functions -upcaseString :: String -> String -upcaseString = map toUpper +upcaseStream :: Stream -> Stream +upcaseStream = T.toUpper assNothing :: Maybe a -> Bool -> Bool assNothing m = assert (isNothing m)