Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Hs-conllu.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ library
exposed-modules: Conllu.Type, Conllu.Data.Tree, Conllu.Data.Token, Conllu.IO, Conllu.Parse, Conllu.Diff, Conllu.Utils
--other-modules: Conllu.Utils
-- other-extensions:
build-depends: base >=4.9 && <5, containers >0.5.8 && <0.6, directory >=1.3 && <1.4, filepath >=1.4 && <1.5, parsec >=3.1 && <3.2
build-depends: base >=4.9 && <5, containers >0.5.8 && <0.6, directory >=1.3 && <1.4, filepath >=1.4 && <1.5, megaparsec >=6 && <7, void <1, text >=1 && <2
hs-source-dirs: src
default-language: Haskell2010
8 changes: 4 additions & 4 deletions src/Conllu/IO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,17 @@ import System.Directory
import System.Environment
import System.FilePath
import System.IO

import Text.Parsec.String
import qualified Text.Megaparsec as M

---
-- uses customized parser
readConlluFileWith :: Parser [Sentence] -> FilePath -> IO Document
readConlluFileWith p f = do
r <- parseFromFile p f
d <- readFile f
let r = M.parse p f d
case r of
Left err -> do
print err
putStr . M.parseErrorPretty $ err
return $ Document f []
Right ss -> return $ Document (takeFileName f) ss

Expand Down
45 changes: 24 additions & 21 deletions src/Conllu/Parse.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE OverloadedStrings #-}
module Conllu.Parse where

{--
Expand All @@ -21,19 +22,21 @@ import Data.List
import Data.Maybe
import System.Environment
import System.IO
import Data.Void
import qualified Data.Text as T

import Text.Parsec hiding (token)
import Text.Parsec.Combinator
import Text.Parsec.String
import Text.ParserCombinators.Parsec.Char
import qualified Text.Megaparsec as M
import Text.Megaparsec.Char

type Parser = M.Parsec Void Stream

---
-- conllu parsers
document :: Parser [Sentence]
document = documentC sentence

documentC :: Parser Sentence -> Parser [Sentence]
documentC s = endBy1 s blankLine <* eof
documentC s = M.endBy1 s blankLine <* M.eof

blankLine :: Parser ()
blankLine = litSpaces <* newline -- spaces shouldn't exist, but no
Expand All @@ -43,7 +46,7 @@ sentence :: Parser Sentence
sentence = sentenceC comment token

sentenceC :: Parser Comment -> Parser Token -> Parser Sentence
sentenceC c t = liftM2 Sentence (many c) (many1 t)
sentenceC c t = liftM2 Sentence (M.many c) (M.some t)

comment :: Parser Comment
comment = do char '#'
Expand All @@ -59,8 +62,8 @@ tokenC :: Parser Index -> Parser IxSep -> Parser Form -> Parser Lemma
-> Parser Token
tokenC ix is fo l up xp fe dh dr ds m =
mkToken <$> ix
<*> optionMaybe is
<*> optionMaybe ix <* tab
<*> M.optional is
<*> M.optional ix <* tab
<*> fo <* tab
<*> l <* tab
<*> up <* tab
Expand All @@ -76,11 +79,11 @@ emptyField = do char '_'
return Nothing

index :: Parser Index
index = do ix <- many1 digit
index = do ix <- M.some digitChar
return (read ix :: Index)

indexSep :: Parser IxSep
indexSep = choice [char '-', char '.']
indexSep = char '-' M.<|> char '.'

form :: Parser Form
form = maybeEmpty stringWSpaces
Expand Down Expand Up @@ -108,11 +111,11 @@ deprel = maybeEmpty deprel'

deprel' :: Parser (Dep, Subtype)
deprel' = do dep <- dep
st <- option [] $ char ':' *> many1 letter
st <- M.option [] $ char ':' *> M.some letterChar
return (dep,st)
where
dep :: Parser Dep
dep = liftM mkDep $ many1 letter
dep = liftM mkDep $ M.some letterChar

deps :: Parser Deps
deps = listP $ listPair ':' index deprel'
Expand All @@ -124,38 +127,38 @@ misc = maybeEmpty stringWSpaces
-- utility parsers
litSpaces :: Parser ()
-- because spaces consumes \t and \n
litSpaces = skipMany $ char ' '
litSpaces = M.skipMany $ char ' '

commentPair :: Parser Comment
commentPair =
keyValue '=' (stringNot "=\n\t") (option [] stringWSpaces)
keyValue '=' (stringNot "=\n\t") (M.option [] stringWSpaces)

listPair :: Char -> Parser a -> Parser b -> Parser [(a, b)]
listPair sep p q = sepBy1 (keyValue sep p q) (char '|')
listPair sep p q = M.sepBy1 (keyValue sep p q) (char '|')

stringNot :: String -> Parser String
stringNot :: Stream -> Parser Stream
-- [ ] second litSpaces in symbol is redundant
stringNot s = symbol . many1 $ noneOf s
stringNot s = symbol . M.some $ satisfy (\c -> not $ c `elem` s)

stringWSpaces :: Parser String
stringWSpaces :: Parser Stream
stringWSpaces = stringNot "\t\n"

stringWOSpaces :: Parser String
stringWOSpaces :: Parser Stream
stringWOSpaces = stringNot " \t\n"

---
-- parser combinators
keyValue :: Char -> Parser a -> Parser b -> Parser (a, b)
keyValue sep p q = do key <- p
optional $ char sep
M.optional $ char sep
value <- q
return (key, value)

symbol :: Parser a -> Parser a
symbol p = litSpaces *> p <* litSpaces

maybeEmpty :: Parser a -> Parser (Maybe a)
maybeEmpty p = emptyField <|> liftM Just p
maybeEmpty p = emptyField M.<|> liftM Just p

listP :: Parser [a] -> Parser [a]
-- using a parser that returns a possibly empty list like sepBy and
Expand Down
34 changes: 19 additions & 15 deletions src/Conllu/Type.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE OverloadedStrings #-}
module Conllu.Type where

---
Expand All @@ -10,11 +11,14 @@ import Data.List
import Data.Maybe
import Data.Ord
import Data.Tree
import qualified Data.Text as T

---
-- type and data declarations
type Stream = T.Text

data Document = Document
{ _file :: String
{ _file :: Stream
, _sents :: [Sentence]
} deriving (Eq,Show)

Expand All @@ -23,8 +27,8 @@ data Sentence = Sentence
, _tokens :: [Token]
} deriving (Eq, Show)

type Comment = StringPair
type StringPair = (String, String)
type Comment = StreamPair
type StreamPair = (Stream, Stream)

data Token
= SToken { _ix :: Index
Expand Down Expand Up @@ -59,16 +63,16 @@ data Token

type Index = Int
type IxSep = Char
type Form = Maybe String
type Lemma = Maybe String
type Form = Maybe T.Text
type Lemma = Maybe T.Text
type PosTag = Maybe Pos
type Xpostag = Maybe String
type Feats = [StringPair]
type Xpostag = Maybe T.Text
type Feats = [StreamPair]
type Dephead = Maybe Index
type DepRel = Maybe (Dep,Subtype)
type Subtype = String
type Subtype = T.Text
type Deps = [(Index,(Dep,Subtype))]
type Misc = Maybe String
type Misc = Maybe T.Text

_dep :: Token -> Maybe Dep
_dep = dep . _deprel
Expand Down Expand Up @@ -147,8 +151,8 @@ type ETree = (TTree, [Token]) -- enhanced tree

---
-- constructor functions
mkDep :: String -> Dep
mkDep = mkDep' . upcaseString
mkDep :: Stream -> Dep
mkDep = mkDep' . upcaseStream
where
mkDep' "ACL" = ACL
mkDep' "ADVCL" = ADVCL
Expand Down Expand Up @@ -188,8 +192,8 @@ mkDep = mkDep' . upcaseString
mkDep' "VOCATIVE" = VOCATIVE
mkDep' "XCOMP" = XCOMP

mkPos :: String -> Pos
mkPos = mkPos' . upcaseString
mkPos :: Stream -> Pos
mkPos = mkPos' . upcaseStream
where
mkPos' "ADJ" = ADJ
mkPos' "ADP" = ADP
Expand Down Expand Up @@ -270,8 +274,8 @@ eTokenOK h dr d = assNothing h $ assNothing dr $ assSomething d True

---
-- utility functions
upcaseString :: String -> String
upcaseString = map toUpper
upcaseStream :: Stream -> Stream
upcaseStream = T.toUpper

assNothing :: Maybe a -> Bool -> Bool
assNothing m = assert (isNothing m)
Expand Down