import Data.List (foldl')
import Data.Text (Text)
import qualified Data.ByteString.Lazy as L
import Text.XML.Expat.SAX as Sax

-- | Read the XML file at @path@ as a lazy 'L.ByteString', parse it into a
-- list of SAX events and fold that list into a 'String' that is printed to
-- stdout.
--
-- This is the list-based version the question is about.  @processEvent@ is
-- not part of this snippet; from its use below it must have the type
-- @String -> SAXEvent Text Text -> String@.
parse :: FilePath -> IO ()
parse path = do
  inputText <- L.readFile path
  -- The annotation fixes the tag and text types produced by the parser.
  let saxEvents = Sax.parse defaultParseOptions inputText :: [SAXEvent Text Text]
  -- Strict left fold over the events, starting from the empty string.
  let txt = foldl' processEvent "" saxEvents
  putStrLn txt
在Cabal中激活分析后,它说parse.saxEvents占用了85%的已分配内存.我也使用了foldr,结果是一样的.
如果processEvent变得足够复杂,程序会因堆栈空间溢出错误而崩溃.
我究竟做错了什么?
解决方法
实际上,hexpat 确实提供了“流式(streaming)”接口(就像 xml-conduit 一样)。它使用了不太知名的 List 库,以及该库定义的那个相当丑陋的 List 类型类。原则上,List 包中的 ListT 类型应该可以正常工作。但由于缺少组合器,我很快就放弃了这条路,转而为 Pipes.ListT 的一个 newtype 包装版本编写了这个丑陋的 List 类的相应实例,然后用它导出普通的 Pipes.Producer 函数,例如 parseProducer。为此所需的少量代码作为 PipesSax.hs 附在下面。
一旦有了 parseProducer,我们就可以把 ByteString 或 Text 的 Producer 转换为 SAXEvent 的 Producer,其中事件的组成部分可以是 Text 或 ByteString。下面是一些简单的用法。我使用的是一个 238 MB 的“input.xml”;从 top 命令的输出来看,程序占用的内存从未超过 6 MB。
-- Sax.hs:大多数 IO 动作都使用定义在文件底部的 registryIds 管道,它是针对我手头的一个巨大的 XML 文件定制的;该文件的一个有效片段(1000 行)见 http://sprunge.us/WaQK
{-# LANGUAGE OverloadedStrings #-}

import PipesSax (parseProducer)
import Data.ByteString (ByteString)
import Text.XML.Expat.SAX
import Pipes -- cabal install pipes pipes-bytestring
import Pipes.ByteString (toHandle, fromHandle, stdin, stdout)
import qualified Pipes.Prelude as P
import qualified System.IO as IO
import qualified Data.ByteString.Char8 as Char8

-- | Turn a producer of raw 'ByteString' chunks into a producer of SAX events
-- whose tags and text are 'ByteString's, using the default parse options.
sax :: MonadIO m
    => Producer ByteString m ()
    -> Producer (SAXEvent ByteString ByteString) m ()
sax = parseProducer defaultParseOptions

-- | Stream XML from stdin, printing the hexpat SAX event stream to stdout.
main0 :: IO ()
main0 = runEffect $ sax stdin >-> P.print

-- | Stream the extracted IDs from stdin to stdout.
main1 :: IO ()
main1 = runEffect $ sax stdin >-> registryIds >-> stdout

-- | Write all IDs found in @input.xml@ to @output.txt@.
main2 :: IO ()
main2 =
  IO.withFile "input.xml" IO.ReadMode $ \inp ->
    IO.withFile "output.txt" IO.WriteMode $ \out ->
      runEffect $ sax (fromHandle inp) >-> registryIds >-> toHandle out

-- Folds:

-- | Print the number of elements yielded by 'registryIds' for @input.xml@.
--
-- 'registryIds' yields a newline after every ID, so the printed number
-- counts those newline chunks as well.
main3 :: IO ()
main3 =
  IO.withFile "input.xml" IO.ReadMode $ \inp -> do
    n <- P.length $ sax (fromHandle inp) >-> registryIds
    print n

-- | Sum the numeric part of the IDs - a dumb fold for illustration.
main4 :: IO ()
main4 =
  IO.withFile "input.xml" IO.ReadMode $ \inp -> do
    let pipeline = sax (fromHandle inp) >-> registryIds >-> P.map readIntId
    n <- P.fold (+) 0 id pipeline
    print n
  where
    -- Drop the first two bytes, then read a leading integer; anything that
    -- does not parse (including the "\n" separators) counts as 0.
    readIntId :: ByteString -> Integer
    readIntId = maybe 0 (fromIntegral . fst) . Char8.readInt . Char8.drop 2

-- My xml has tags with attributes that appear via hexpat thus:
--   StartElement "FacilitySite" [("registryId","110007915364")]
-- and the like.  This is just an arbitrary demo stream manipulation.

-- | Forward the @registryId@ attribute value of every matching
-- @FacilitySite@ start tag, followed by a newline; drop every other event.
--
-- The pattern only matches when @registryId@ is the element's sole
-- attribute.
registryIds :: Monad m => Pipe (SAXEvent ByteString ByteString) ByteString m ()
registryIds = do
  e <- await -- we look for a 'SAXEvent'
  case e of
    -- if it matches, we yield, else we go to the next event
    StartElement "FacilitySite" [("registryId", a)] -> do
      yield a
      yield "\n"
      registryIds
    _ -> registryIds
– ‘library’:PipesSax.hs
这只是用 newtype 包装 Pipes.ListT,以便获得合适的实例。我们不导出任何与 List 或 ListT 有关的东西,只使用标准的 Pipes.Producer 概念。
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE TypeFamilies #-}

module PipesSax (parseProducerLocations, parseProducer) where

import Data.ByteString (ByteString)
import Text.XML.Expat.SAX
import Data.List.Class (List (..), ListItem (..))
import Control.Monad (MonadPlus, liftM)
import Control.Applicative (Alternative, Applicative)
import Pipes
import qualified Pipes.Internal as I

-- | Convert a producer of 'ByteString' chunks into a producer of SAX events.
--
-- The input producer is wrapped as a 'ListT_', run through hexpat's
-- list-generic 'parseG', and unwrapped back into a plain 'Producer'.
parseProducer
  :: (Monad m, GenericXMLString tag, GenericXMLString text)
  => ParseOptions tag text
  -> Producer ByteString m ()
  -> Producer (SAXEvent tag text) m ()
parseProducer opt = enumerate . enumerate_ . parseG opt . Select_ . Select

-- | Like 'parseProducer', but built on 'parseLocationsG', so every event is
-- paired with an 'XMLParseLocation'.
parseProducerLocations
  :: (Monad m, GenericXMLString tag, GenericXMLString text)
  => ParseOptions tag text
  -> Producer ByteString m ()
  -> Producer (SAXEvent tag text, XMLParseLocation) m ()
parseProducerLocations opt =
  enumerate . enumerate_ . parseLocationsG opt . Select_ . Select

-- | A newtype around pipes' 'ListT' that exists only to carry the 'List'
-- instance below; it is not exported.
--
-- NOTE(review): on GHC >= 8.4 'Monoid' has a 'Semigroup' superclass, so
-- @Semigroup@ probably has to be added to this deriving list - confirm
-- against the GHC and pipes versions in use.
newtype ListT_ m a = Select_ { enumerate_ :: ListT m a }
  deriving
    ( Functor
    , Monad
    , MonadPlus
    , MonadIO
    , Applicative
    , Alternative
    , Monoid
    , MonadTrans
    )

instance Monad m => List (ListT_ m) where
  type ItemM (ListT_ m) = m

  -- Embed a base-monad action that returns a list by wrapping it in the
  -- proxy's 'I.M' constructor.
  joinL = Select_ . Select . I.M . liftM (enumerate . enumerate_)

  -- Step the underlying producer once with 'next' and translate the result
  -- into a 'ListItem'.
  runList = liftM emend . next . enumerate . enumerate_
    where
      emend (Right (a, q)) = Cons a (Select_ (Select q))
      emend _ = Nil
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。