Comparing cursor vs conduit xml parsing

131 views Asked by At

here's some xml i'm parsing -- it's my first time working with xml, soap, or conduit.

<?xml version  = "1.0" 
      encoding = "utf-8"?>
<soap:Envelope xmlns:soap = "http://schemas.xmlsoap.org/soap/envelope/" 
               xmlns:xsi  = "http://www.w3.org/2001/XMLSchema-instance" 
               xmlns:xsd  = "http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetListItemsResponse xmlns = "http://schemas.microsoft.com/sharepoint/soap/">
<GetListItemsResult>
<listitems xmlns:s  = 'uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882'
           xmlns:dt = 'uuid:C2F41010-65B3-11d1-A29F-00AA00C14882'
           xmlns:rs = 'urn:schemas-microsoft-com:rowset'
           xmlns:z  = '#RowsetSchema'>
<rs:data ItemCount = "290">
<z:row ows_Date              = '2020-10-20 00:00:00' 
       ows_Document          = 'https://www.oregon.gov/oha/PH/DISEASESCONDITIONS/DISEASESAZ/Emerging%20Respitory%20Infections/Oregon-COVID-19-Update-10-20-2020-FINAL.pdf, Oregon COVID-19 Daily Update 10.20.2020' 
       ows_Category          = 'Daily Update' 
       ows_MetaInfo          = '294;#' 
       ows__ModerationStatus = '0' 
       ows__Level            = '1' 
       ows_ID                = '294' 
       ows_UniqueId          = '294;#{C51D9DDB-9A9C-4C56-B030-236D6A0980D2}' 
       ows_owshiddenversion  = '1' 
       ows_FSObjType         = '294;#0' 
       ows_Created           = '2020-10-20 12:16:49' 
       ows_PermMask          = '0x1000030041' 
       ows_Modified          = '2020-10-20 12:16:49' 
       ows_FileRef           = '294;#oha/ERD/Lists/COVID19 Updates/294_.000' />
</rs:data>
</listitems>
</GetListItemsResult>
</GetListItemsResponse>
</soap:Body>
</soap:Envelope>

i only want to keep records where ows_Category is Weekly Report and ows_Document doesn't contain Spanish. i got the cursor version working easily. the conduit version was a lot trickier, but i eventually figured it out with the answer to this question.

even though both methods are working now, i have a few questions.

  • does the conduit approach have an equivalent for lax ignoring of namespaces?
  • what is making the concat necessary in the cursor function? looking at the types, we are starting with the root node, generating and maintaining a list of related nodes to consider, filtering them, mapping over them, etc. what is making another layer of nesting, and why?
  • the conduit version needed helpers f (to call force everywhere) and ns (to namespace everything) -- they seemed so necessary that i would have thought the library would offer them as utilities because everyone would need them all the time. or am i doing something silly?
  • my worst sticking point was that i turned out to need the gliRspNS namespace on GetListItemsResult and listitems, even though in the xml it looks like it should only apply to GetListItemsResponse. it was only a lucky guess that got me past this. are namespaces supposed to inherit down until overridden like this?
  • regarding requireAttrRaw:
    • shouldn't we need to know the namespace if we're in charge of verifying Name?
    • why does requireAttrRaw send us [Content] instead of two Maybe Content, one each for ContentText and ContentEntity?
    • what are we supposed to do with ContentEntity "For pass-through parsing"?
{-# LANGUAGE OverloadedStrings #-}

import           Conduit
import           Control.Applicative
import           Control.Arrow
import           Control.Exception
import           Control.Monad
import qualified Data.ByteString.Lazy.Char8 as L8
import           Data.Foldable
import qualified Data.Map.Strict            as M
import           Data.String
import qualified Data.Text                  as T
import           Data.Time.Calendar
import           Data.Time.Format
import           Data.XML.Types
import qualified Text.XML                   as X
import           Text.XML.Cursor            hiding (force)
import           Text.XML.Stream.Parse

-- | One document record extracted from a @row@ element of the list response.
data Doc = Doc
  { url :: String
    -- ^ The part of the @ows_Document@ attribute before its first comma.
  , name :: String
    -- ^ The remainder of @ows_Document@, after the comma and the following
    -- character have been dropped.
  , date :: Day
    -- ^ The day parsed from the leading (pre-space) part of @ows_Date@.
  }
  deriving Show

-- | Read @oha.xml@ and extract every @row@ whose @ows_Category@ is
-- @Weekly Report@ and whose @ows_Document@ does not mention @Spanish@.
--
-- The extraction is done twice, once with the DOM cursor API ('go') and once
-- with the streaming parser ('parseDocs'). Each result is printed one 'Doc'
-- per line, followed by the number of documents found.
--
-- A DOM parse error is rethrown with 'throwIO'; 'parseTimeOrError' raises if
-- an @ows_Date@ value does not start with a @%Y-%-m-%-d@ date.
main :: IO ()
main = do
  r <- L8.readFile "oha.xml"

  -- Cursor version. Every stage is an axis of type @Cursor -> [a]@, so the
  -- stages are chained with '>=>' in the list monad. The last stage already
  -- returns a list ('attribute' yields @[Text]@); composing it with '>=>'
  -- rather than mapping it with '&|' flattens the result, so no separate
  -- 'concat' is needed.
  let go :: Cursor -> [Doc]
      go c =
        c $// laxElement  "row"
          >=> attributeIs "ows_Category" "Weekly Report"
          >=> checkElement (maybe False (not . T.isInfixOf "Spanish") . M.lookup "ows_Document" . X.elementAttributes)
          >=> \x -> doc <$> attribute "ows_Document" x <*> attribute "ows_Date" x

      -- Build a 'Doc' from the raw @ows_Document@ text (first argument) and
      -- the raw @ows_Date@ text (second argument). The document text is split
      -- at its first comma into url and name; 'drop 2' removes the comma and
      -- the character after it. Only the part of the date before the first
      -- space is parsed.
      doc x = Doc u v . parseTimeOrError True defaultTimeLocale "%Y-%-m-%-d" . takeWhile (/= ' ') . T.unpack
        where (u,v) = second (drop 2) . break (== ',') $ T.unpack x

      parseAttributes, parseAttributes' :: AttrParser (T.Text, T.Text)

      -- Monadic variant, kept for comparison: read the three attributes, then
      -- reject the row unless both conditions hold.
      parseAttributes' = do
        docAttr  <- requireAttr "ows_Document"
        catAttr  <- requireAttr "ows_Category"
        dateAttr <- requireAttr "ows_Date"
        ignoreAttrs
        guard $ not (T.isInfixOf "Spanish" docAttr) && catAttr == "Weekly Report"
        return (docAttr, dateAttr)

      -- since the attribute values don't interact, we can parse in Applicative rather than Monad
      parseAttributes = (,) <$> requireAttrRaw' "ows_Document" (not . T.isInfixOf "Spanish")
                            <*> requireAttr     "ows_Date"
                            <*  requireAttrRaw' "ows_Category" ("Weekly Report" ==)
                            <*  ignoreAttrs
        where
          -- Require an attribute called @n@ whose complete value satisfies
          -- @ok@, and return that value.
          --
          -- An attribute value arrives as a list of 'Content' chunks. The
          -- chunks are joined before the predicate runs, so a value split
          -- into several chunks is judged (and returned) as a whole rather
          -- than fragment by fragment, and a 'ContentEntity' chunk no longer
          -- hits an incomplete pattern match.
          --
          -- @fromString n@ builds a 'Name' from a plain string.
          -- NOTE(review): presumably that carries no namespace, matching
          -- unprefixed attributes -- confirm against the 'Name' docs.
          requireAttrRaw' n ok =
            requireAttrRaw ("required attr value failed condition: " <> n) $ \(n', chunks) ->
              let v = attrText chunks
              in  v <$ guard (n' == fromString n && ok v)

          -- Join the chunks of one attribute value; an unresolved entity is
          -- written back in its @&name;@ source form.
          attrText = T.concat . map chunk
            where chunk (ContentText t)   = t
                  chunk (ContentEntity e) = "&" <> e <> ";"

      -- Qualify local name @s@ with namespace @n@ using the @{namespace}local@
      -- string form.
      ns n = fromString . (("{" <> n <> "}") <>)

      -- Wrap tag parser @g@ so that the element @s@ in namespace @n@ is
      -- mandatory; 'force' turns its absence into an "<s> required" error.
      f g n s = force (s <> " required") . g (ns n s)

      -- Streaming version: descend through the envelope and collect one 'Doc'
      -- per @row@ accepted by 'parseAttributes'.
      --
      -- GetListItemsResult and listitems need gliRspNS as well: a default
      -- namespace declaration (@xmlns = "..."@ on GetListItemsResponse)
      -- applies to the declaring element and to every unprefixed descendant
      -- element until another default is declared.
      parseDocs :: (MonadThrow m, MonadIO m) => ConduitT Event o m [Doc]
      parseDocs = f tagNoAttr      soapNS   "Envelope"
                . f tagNoAttr      soapNS   "Body"
                . f tagNoAttr      gliRspNS "GetListItemsResponse"
                . f tagNoAttr      gliRspNS "GetListItemsResult"
                . f tagNoAttr      gliRspNS "listitems"
                . f tagIgnoreAttrs rsNS     "data"
                . many' . tag' (ns zNS      "row")
                               parseAttributes $ return . uncurry doc

      soapNS   = "http://schemas.xmlsoap.org/soap/envelope/"
      gliRspNS = "http://schemas.microsoft.com/sharepoint/soap/"
      rsNS     = "urn:schemas-microsoft-com:rowset"
      zNS      = "#RowsetSchema"

      -- Print every document on its own line, then the number of documents.
      disp docs = traverse_ print docs *> print (length docs)

  either throwIO (disp . go . fromDocument) $ X.parseLBS X.def r
  disp =<< runConduit (parseLBS def r .| parseDocs)

lastly, i normally get the xml from Network.HTTP.Simple.httpLBS instead of reading from a file. am i right that there's a way to wire the conduit parser up to Network.HTTP.Client.Conduit.httpLBS so that it operates on the stream directly?

0

There are 0 answers