Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions app1/Main.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- |
-- Module : Main
-- Copyright : © 2018 bruno cuconato
-- License : LGPL-3
--
-- Maintainer : bruno cuconato <bcclaro+hackage@gmail.com>
-- Stability : experimental
-- Portability : non-portable
--
-- the @hs-conllu@ executable.

module Main
( main )
where

import Conllu.IO (readAndPrintConllu, diffConllu)

import System.Environment

main :: IO ()
-- | entry point of the @hs-conllu@ executable. The first command-line
-- argument selects the command, the remaining ones are file paths:
--
-- * @validate FILE...@ : read each CoNLL-U file given and print it to
--   stdout (use your shell's globbing/completion to pass several files).
--
-- * @diff FILE1 FILE2@ : hand the two files to 'diffConllu'; any
--   further arguments are ignored.
--
-- An unrecognised command does nothing. Running without a command, or
-- @diff@ with fewer than two files, raises an 'IOError' carrying a
-- usage message instead of a pattern-match or index failure.
main = do
  args <- getArgs
  case args of
    [] -> usage "missing command"
    "validate" : files -> mapM_ readAndPrintConllu files
    "diff" : file1 : file2 : _ -> diffConllu file1 file2
    "diff" : _ -> usage "diff expects two files"
    _ -> return ()
  where
    -- Abort with an explicit message rather than crashing on a partial
    -- pattern or on list indexing.
    usage problem =
      ioError . userError $
        problem ++ "; usage: hs-conllu (validate FILE... | diff FILE1 FILE2)"

File renamed without changes.
71 changes: 71 additions & 0 deletions package.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
name: hs-conllu
version: 0.1.6
synopsis: Conllu validating parser and utils.
description: utilities to parse, print, diff, and analyse data in CoNLL-U format.
homepage: https://github.com/odanoburu/hs-conllu
bug-reports: https://github.com/odanoburu/hs-conllu/issues
license: LGPL-3.0-only
license-file: LICENSE
author: bruno cuconato
maintainer: bruno cuconato <bcclaro+haskell+hsconllu@gmail.com>
copyright: 2021 bruno cuconato
build-type: Simple
extra-doc-files:
- README
- CHANGELOG
tested-with: ghc == 9.2.2

verbatim:
cabal-version: 2.0

ghc-options:
- -Wall
- -Wcompat
- -Widentities
- -Wincomplete-uni-patterns
- -Wincomplete-record-updates

default-extensions:
- OverloadedStrings
- LambdaCase
# - NoImplicitPrelude

dependencies:
- aeson
- base >= 4.15 && < 5
- bytestring
- containers
- directory
- filepath
- megaparsec
- mtl
- optparse-applicative
- scientific
- string-interpolate
- text
- text-conversions
- text-show
- unordered-containers
- vector

library:
source-dirs: src
other-modules: []
exposed-modules:
- Conllu.DeprelTagset
- Conllu.Diff
- Conllu.IO
- Conllu.Parse
- Conllu.Print
- Conllu.Test
- Conllu.Type
- Conllu.UposTagset
- Conllu.Utils

executables:
hs-conllu:
main: Main.hs
source-dirs: app1
dependencies:
- hs-conllu

108 changes: 106 additions & 2 deletions src/Conllu/DeprelTagset.hs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@
-- import qualified Conllu.DeprelTagset as D
-- @

module Conllu.DeprelTagset where
-- TODO create the UD Taxonomy Table and enhanced dependencies
-- < see https://universaldependencies.org/v2/conll-u.html>
-- TODO include modifier labels, e.g. passive verbs, relative clauses, etc.
-- TODO resolve deriving strategy

module Conllu.DeprelTagset
where
{- original declaration:
data EP
= REF -- ^ only allowed in DEPS
| ACL
Expand Down Expand Up @@ -54,4 +60,102 @@ data EP
| ROOT
| VOCATIVE
| XCOMP
deriving (Enum, Eq, Read, Show)
deriving ( Show, Eq ) -- , Read,Enum ) It is not clear why Enum and Eq are required
-}

-- Revised Universal Dependency Relations
-- TODO include subtyped relations and Structural Categories of the dependent
-- Universal Dependencies are represented lowercase
-- see https://universaldependencies.org/u/dep/index.html

-- | Universal dependency relation labels (the DEPREL values), one
-- constructor per top-level relation. Relation subtypes are not
-- modelled as constructors yet; they are only listed in the
-- documentation of the relation they refine.
data EP -- DEPREL
  = REF         -- ^ only allowed in DEPS
  | ACL         -- ^ clausal modifier of noun (adnominal clause).
                --   Subtype not modelled yet: RELCL (relative clause modifier).
  | ADVCL       -- ^ adverbial clause modifier
  | ADVMOD      -- ^ adverbial modifier.
                --   Subtypes not modelled yet: EMPH (emphasizing word,
                --   intensifier), LMOD (locative adverbial modifier).
  | AMOD        -- ^ adjectival modifier
  | APPOS       -- ^ appositional modifier
  | AUX         -- ^ auxiliary.
                --   Subtype not modelled yet: PASS (passive auxiliary).
  | CASE        -- ^ case marking
  | CC          -- ^ coordinating conjunction.
                --   Subtype not modelled yet: PRECONJ (preconjunct).
  | CCOMP       -- ^ clausal complement
  | CLF         -- ^ classifier
  | COMPOUND    -- ^ compound.
                --   Subtypes not modelled yet: LVC (light verb construction),
                --   PRT (phrasal verb particle), REDUP (reduplicated
                --   compounds), SVC (serial verb compounds).
  | CONJ        -- ^ conjunct
  | COP         -- ^ copula
  | CSUBJ       -- ^ clausal subject.
                --   Subtype not modelled yet: PASS (clausal passive subject).
  | DEP         -- ^ unspecified dependency
  | DET         -- ^ determiner.
                --   Subtypes not modelled yet: NUMGOV (pronominal quantifier
                --   governing the case of the noun), NUMMOD (pronominal
                --   quantifier agreeing in case with the noun), POSS
                --   (possessive determiner).
  | DISCOURSE   -- ^ discourse element
  | DISLOCATED  -- ^ dislocated elements
  | EXPL        -- ^ expletive.
                --   Subtypes not modelled yet: IMPERS (impersonal expletive),
                --   PASS (reflexive pronoun used in reflexive passive), PV
                --   (reflexive clitic with an inherently reflexive verb).
  | FIXED       -- ^ fixed multiword expression
  | FLAT        -- ^ flat multiword expression.
                --   Subtypes not modelled yet: FOREIGN (foreign words),
                --   NAME (names).
  | GOESWITH    -- ^ goes with
  | IOBJ        -- ^ indirect object
  | LIST        -- ^ list
  | MARK        -- ^ marker
  | NMOD        -- ^ nominal modifier.
                --   Subtypes not modelled yet: POSS (possessive nominal
                --   modifier), TMOD (temporal modifier).
  | NSUBJ       -- ^ nominal subject.
                --   Subtype not modelled yet: PASS (passive nominal subject).
  | NUMMOD      -- ^ numeric modifier.
                --   Subtype not modelled yet: GOV (numeric modifier governing
                --   the case of the noun).
  | OBJ         -- ^ object
  | OBL         -- ^ oblique nominal.
                --   Subtypes not modelled yet: AGENT (agent modifier), ARG
                --   (oblique argument), LMOD (locative modifier), TMOD
                --   (temporal modifier).
  | ORPHAN      -- ^ orphan
  | PARATAXIS   -- ^ parataxis
  | PUNCT       -- ^ punctuation
  | REPARANDUM  -- ^ overridden disfluency
  | ROOT        -- ^ root
  | VOCATIVE    -- ^ vocative
  | XCOMP       -- ^ open clausal complement
  -- 'Enum' and 'Read' are kept alongside 'Eq' and 'Show', as in the
  -- earlier declaration of this type, so that code relying on 'read' or
  -- on enumerating the constructors keeps compiling.
  deriving (Enum, Eq, Read, Show)


-- Structural Categories of the dependent
{-
type StructuralCats = [Nominals , Clauses , ModifierWords , FunctionWords]
type Nominals = [Nsubj , Obj , Iobj , Obl , Vocative , Expl , Dislocated , Nmod , Appos , Nummod]
type Clauses = [Csubj , Ccomp , Xcomp , Advcl , Acl]
type ModifierWords = [Advmod , Discourse , Amod]
type FunctionWords = [Aux , Cop , Mark , Det , Clf , Case]

-- Functional Categories in relation to the head
type FunctionalCats = [ Core , NonCore , NominalDependent]
type Core = [Nsubj , Obj , Iobj , Csubj , Ccomp , Xcomp]
type NonCore = [Obl , Vocative , Expl , Dislocated , Advcl , Advmod , Discourse , Aux , Cop , Mark]
type NominalDependent = [Nmod , Appos , Nummod , Acl , Amod , Det , Clf , Case]

-- Relations that are not dependency relations in the narrow sense.
type Coordination = [ Conj , Cc]
type MWE = [ Fixed , Flat , Compound] -- MultiWordExpressions
type Loose = [ List , Parataxis]
type Special = [ Orphan , Goeswith , Reparandum]
type Other = [ Punct , Root , Dep]
-}


24 changes: 14 additions & 10 deletions src/Conllu/Diff.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-- |
{-# LANGUAGE OverloadedStrings #-}
-- -- |
-- Module : Conllu.Diff
-- Copyright : © 2018 bruno cuconato
-- License : LGPL-3
Expand All @@ -23,10 +24,11 @@ import Conllu.Type
import Conllu.Utils

import Data.Maybe
import Data.Ord
import Data.Ord (comparing)
import Data.Text (unpack)

---
-- * type synonims
-- * type synonyms
-- | CoNLL-U field diff.
type FDiff = StringPair

Expand All @@ -45,7 +47,7 @@ diffW = any isJust . printFieldDiffs

diffWs :: [CW a] -> [CW a] -> [WDiff a]
-- | pairs up the words of the two lists positionally (as 'zip' does,
-- so surplus words of the longer list are dropped) and keeps only the
-- pairs that 'diffW' reports as different.
diffWs ws1 ws2 = [wordPair | wordPair <- zip ws1 ws2, diffW wordPair]

diffS :: (Sent, Sent) -> SDiff AW
-- | diffs the sentence pair's words.
Expand Down Expand Up @@ -79,8 +81,10 @@ sentId :: Sent -> Maybe Index
-- | try to find an index in a sentence's metadata looking for
-- 'sent_id = n'.
sentId s =
let mi = lookup "sent_id " $ _meta s
i = fromMaybe "0" mi
-- let myHashMap = fromList $ _meta s
-- mi = myHashMap !? "sent_id"
let mi = lookup "sent_id" $ _meta s
i = unpack $ fromMaybe "0" mi
in safeRead i :: Maybe Index

pairSents :: [Sent] -> [Sent] -> [(Sent, Sent)]
Expand All @@ -89,7 +93,7 @@ pairSents = pairSentsBy $ comparing sentId

---
-- * printing functions
printFieldDiffs :: WDiff a -> [Maybe StringPair]
printFieldDiffs :: WDiff a -> [Maybe (String, String)]
-- | list of maybe differing fields in a pair of words.
printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs
where
Expand All @@ -110,14 +114,14 @@ printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs
, showM . _misc
]

printWDiff :: WDiff a -> [(String, String)]
-- | list of differing fields in a pair of words: the 'Just' entries
-- produced by 'printFieldDiffs', in order.
printWDiff wordPair = catMaybes (printFieldDiffs wordPair)

printSDiff :: SDiff a -> [[(String, String)]]
-- | list of differing words in a sentence: applies 'printWDiff' to
-- every word pair of the sentence diff.
printSDiff sentDiff = map printWDiff sentDiff

printDDiff :: DDiff a -> [[[(String, String)]]]
-- | list of lists of differing words in sentences: applies
-- 'printSDiff' to every sentence diff of the document diff.
printDDiff docDiff = map printSDiff docDiff
35 changes: 23 additions & 12 deletions src/Conllu/IO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,30 @@
--
-- Defines major IO functions.

module Conllu.IO where
module Conllu.IO
(
readAndPrintConllu,
diffConllu,
readDirectory,
readConlluFile
)
where

---
-- imports
import Conllu.Type
import Conllu.Utils
import Conllu.Parse
import Conllu.Print
import Conllu.Diff
import Conllu.Type (Doc, Sent)
import Conllu.Utils (if')
import Conllu.Parse (Parser, sentence, parseConlluWith)
import Conllu.Print (printDoc)
import Conllu.Diff (diffSs, printDDiff)

import System.Directory
import System.FilePath
import System.Directory (listDirectory, doesFileExist, doesDirectoryExist) --, getDirectoryContents)
import System.FilePath ((</>)) -- Combine two paths with a path separator

-- import Control.Monad (forM) --GVF added for RWH getRecursiveContents at end
import Data.Functor (($>))
import qualified Data.Text.IO as TIO
-- import qualified Data.Text as T

-- * read functions

Expand All @@ -32,9 +43,9 @@ import System.FilePath
readConlluFileWith :: Parser Sent -> FilePath -> IO Doc
-- | reads a file with a customized parser. The file contents are read
-- as text and passed, together with the file path, to
-- 'parseConlluWith'. On a parse failure the error message is written
-- with 'putStr' and an empty document (@[]@) is returned.
readConlluFileWith p f = do
  contents <- TIO.readFile f
  either reportFailure return (parseConlluWith p f contents)
  where
    -- print the parser's message, then fall back to the empty document
    reportFailure err = putStr err $> []

readDirectoryWith :: Parser Sent -> FilePath -> IO [Doc]
Expand Down Expand Up @@ -70,14 +81,14 @@ readConllu = readConlluWith sentence
-- * write
writeConlluFile :: FilePath -> Doc -> IO ()
-- | writes a CoNLL-U file to disk: renders the document with
-- 'printDoc' and writes the result to the given path.
writeConlluFile fp doc = TIO.writeFile fp (printDoc doc)

---
-- * print
readAndPrintConllu :: FilePath -> IO ()
-- | reads the CoNLL-U file given and prints it to stdout.
--
-- NOTE(review): 'TIO.putStrLn' appends a newline after the rendered
-- document; if 'printDoc' already ends its output with one this emits
-- an extra blank line -- confirm which is intended.
readAndPrintConllu fp = readConlluFile fp >>= TIO.putStrLn . printDoc

---
Expand Down
Loading