diff options
author | Diego Nehab <diego@tecgraf.puc-rio.br> | 2007-05-31 21:23:42 +0000 |
---|---|---|
committer | Diego Nehab <diego@tecgraf.puc-rio.br> | 2007-05-31 21:23:42 +0000 |
commit | 7b195164b0c8755b15e8055f1d524282847f6e13 (patch) | |
tree | 0992aaf2ae0fd41202e4953512a7d806ae101df4 | |
parent | 37f266ceea2d257e58734a83addf6be3927114a3 (diff) | |
download | luasocket-7b195164b0c8755b15e8055f1d524282847f6e13.tar.gz luasocket-7b195164b0c8755b15e8055f1d524282847f6e13.tar.bz2 luasocket-7b195164b0c8755b15e8055f1d524282847f6e13.zip |
Lua Gem.
-rw-r--r-- | gem/ltn012.tex | 678 | ||||
-rw-r--r-- | gem/makefile | 14 | ||||
-rwxr-xr-x | gem/myps2pdf | 113 |
3 files changed, 805 insertions, 0 deletions
diff --git a/gem/ltn012.tex b/gem/ltn012.tex new file mode 100644 index 0000000..7dbc5ef --- /dev/null +++ b/gem/ltn012.tex | |||
@@ -0,0 +1,678 @@ | |||
1 | \documentclass[10pt]{article} | ||
2 | \usepackage{fancyvrb} | ||
3 | \usepackage{url} | ||
4 | \DefineVerbatimEnvironment{lua}{Verbatim}{fontsize=\small,commandchars=\@\#\%} | ||
5 | \DefineVerbatimEnvironment{C}{Verbatim}{fontsize=\small,commandchars=\@\#\%} | ||
6 | \DefineVerbatimEnvironment{mime}{Verbatim}{fontsize=\small,commandchars=\$\#\%} | ||
7 | \newcommand{\stick}[1]{\vbox{\setlength{\parskip}{0pt}#1}} | ||
8 | \newcommand{\bl}{\ensuremath{\mathtt{\backslash}}} | ||
9 | |||
10 | |||
11 | \title{Filters, sources, sinks, and pumps\\ | ||
12 | {\large or Functional programming for the rest of us}} | ||
13 | \author{Diego Nehab} | ||
14 | |||
15 | \begin{document} | ||
16 | |||
17 | \maketitle | ||
18 | |||
19 | \begin{abstract} | ||
20 | Certain data processing operations can be implemented in the | ||
21 | form of filters. A filter is a function that can process data | ||
22 | received in consecutive function calls, returning partial | ||
23 | results after each invocation. Examples of operations that can be | ||
24 | implemented as filters include the end-of-line normalization | ||
25 | for text, Base64 and Quoted-Printable transfer content | ||
26 | encodings, the breaking of text into lines, SMTP byte | ||
27 | stuffing, and there are many others. Filters become even | ||
28 | more powerful when we allow them to be chained together to | ||
29 | create composite filters. In this context, filters can be seen | ||
30 | as the middle links in a chain of data transformations. Sources and sinks | ||
31 | are the corresponding end points of these chains. A source | ||
32 | is a function that produces data, chunk by chunk, and a sink | ||
33 | is a function that takes data, chunk by chunk. In this | ||
34 | chapter, we describe the design of an elegant interface for filters, | ||
35 | sources, sinks, and chaining, and refine it | ||
36 | until it reaches a high degree of generality. We discuss | ||
37 | implementation challenges, provide practical solutions, | ||
38 | and illustrate each step with concrete examples. | ||
39 | \end{abstract} | ||
40 | |||
41 | |||
42 | \section{Introduction} | ||
43 | |||
44 | Within the realm of networking applications, we are often | ||
45 | required to apply transformations to streams of data. Examples | ||
46 | include the end-of-line normalization for text, Base64 and | ||
47 | Quoted-Printable transfer content encodings, breaking text | ||
48 | into lines with a maximum number of columns, SMTP | ||
49 | dot-stuffing, \texttt{gzip} compression, HTTP chunked | ||
50 | transfer coding, and the list goes on. | ||
51 | |||
52 | Many complex tasks require a combination of two or more such | ||
53 | transformations, and therefore a general mechanism for | ||
54 | promoting reuse is desirable. In the process of designing | ||
55 | LuaSocket 2.0, David Burgess and I were forced to deal with | ||
56 | this problem. The solution we reached proved to be very | ||
57 | general and convenient. It is based on the concepts of | ||
58 | filters, sources, sinks, and pumps, which we introduce | ||
59 | below. | ||
60 | |||
61 | \emph{Filters} are functions that can be repeatedly invoked | ||
62 | with chunks of input, successively returning processed | ||
63 | chunks of output. More importantly, the result of | ||
64 | concatenating all the output chunks must be the same as the | ||
65 | result of applying the filter over the concatenation of all | ||
66 | input chunks. In fancier language, filters \emph{commute} | ||
67 | with the concatenation operator. As a result, chunk | ||
68 | boundaries are irrelevant: filters correctly handle input | ||
69 | data no matter how it was originally split. | ||
70 | |||
71 | A \emph{chain} transparently combines the effect of one or | ||
72 | more filters. The interface of a chain must be | ||
73 | indistinguishable from the interface of its components. | ||
74 | This allows a chained filter to be used wherever an atomic | ||
75 | filter is expected. In particular, chains can be chained | ||
76 | themselves to create arbitrarily complex operations. | ||
77 | |||
78 | Filters can be seen as internal nodes in a network through | ||
79 | which data will flow, potentially being transformed many | ||
80 | times along its way. Chains connect these nodes together. | ||
81 | To complete the picture, we need \emph{sources} and | ||
82 | \emph{sinks}. These are the initial and final nodes of the | ||
83 | network, respectively. Less abstractly, a source is a | ||
84 | function that produces new data every time it is called. | ||
85 | Conversely, sinks are functions that give a final | ||
86 | destination to the data they receive. Naturally, sources | ||
87 | and sinks can also be chained with filters to produce | ||
88 | filtered sources and sinks. | ||
89 | |||
90 | Finally, filters, chains, sources, and sinks are all passive | ||
91 | entities: they must be repeatedly invoked in order for | ||
92 | anything to happen. \emph{Pumps} provide the driving force | ||
93 | that pushes data through the network, from a source to a | ||
94 | sink. | ||
95 | |||
96 | These concepts will become less abstract with examples. In | ||
97 | the following sections, we start with a simplified | ||
98 | interface, which we refine several times until no obvious | ||
99 | shortcomings remain. The evolution we present is not | ||
100 | contrived: it recreates the steps we followed ourselves as | ||
101 | we consolidated our understanding of these concepts and the | ||
102 | applications that benefit from them. | ||
103 | |||
104 | \subsection{A concrete example} | ||
105 | |||
106 | Let us use the end-of-line normalization of text as an | ||
107 | example to motivate our initial filter interface. | ||
108 | Assume we are given text in an unknown end-of-line | ||
109 | convention (including possibly mixed conventions) out of the | ||
110 | commonly found Unix (LF), Mac OS (CR), and DOS (CRLF) | ||
111 | conventions. We would like to be able to write code like the | ||
112 | following: | ||
113 | \begin{quote} | ||
114 | \begin{lua} | ||
115 | @stick# | ||
116 | local input = source.chain(source.file(io.stdin), normalize("\r\n")) | ||
117 | local out = sink.file(io.stdout) | ||
118 | pump.all(input, out) | ||
119 | % | ||
120 | \end{lua} | ||
121 | \end{quote} | ||
122 | |||
123 | This program should read data from the standard input stream | ||
124 | and normalize the end-of-line markers to the canonic CRLF | ||
125 | marker, as defined by the MIME standard. Finally, the | ||
126 | normalized text should be sent to the standard output | ||
127 | stream. We use a \emph{file source} that produces data from | ||
128 | standard input, and chain it with a filter that normalizes | ||
129 | the data. The pump then repeatedly obtains data from the | ||
130 | source, and passes it to the \emph{file sink}, which sends | ||
131 | it to the standard output. | ||
132 | |||
133 | In the code above, the \texttt{normalize} \emph{factory} is a | ||
134 | function that creates our normalization filter. This filter | ||
135 | will replace any end-of-line marker with the canonic | ||
136 | `\verb|\r\n|' marker. The initial filter interface is | ||
137 | trivial: a filter function receives a chunk of input data, | ||
138 | and returns a chunk of processed data. When there are no | ||
139 | more input data left, the caller notifies the filter by invoking | ||
140 | it with a \texttt{nil} chunk. The filter responds by returning | ||
141 | the final chunk of processed data. | ||
142 | |||
143 | Although the interface is extremely simple, the | ||
144 | implementation is not so obvious. Any filter | ||
145 | respecting this interface needs to keep some kind of context | ||
146 | between calls. This is because chunks can for example be broken | ||
147 | between the CR and LF characters marking the end of a line. This | ||
148 | need for contextual storage is what motivates the use of | ||
149 | factories: each time the factory is called, it returns a | ||
150 | filter with its own context so that we can have several | ||
151 | independent filters being used at the same time. For | ||
152 | efficiency reasons, we must avoid the obvious solution of | ||
153 | concatenating all the input into the context before | ||
154 | producing any output. | ||
155 | |||
156 | To that end, we will break the implementation in two parts: | ||
157 | a low-level filter, and a factory of high-level filters. The | ||
158 | low-level filter will be implemented in C and will not carry | ||
159 | any context between function calls. The high-level filter | ||
160 | factory, implemented in Lua, will create and return a | ||
161 | high-level filter that maintains whatever context the low-level | ||
162 | filter needs, but isolates the user from its internal | ||
163 | details. That way, we take advantage of C's efficiency to | ||
164 | perform the hard work, and take advantage of Lua's | ||
165 | simplicity for the bookkeeping. | ||
166 | |||
167 | \subsection{The Lua part of the filter} | ||
168 | |||
169 | Below is the complete implementation of the factory of high-level | ||
170 | end-of-line normalization filters: | ||
171 | \begin{quote} | ||
172 | \begin{lua} | ||
173 | @stick# | ||
174 | function filter.cycle(low, ctx, extra) | ||
175 | return function(chunk) | ||
176 | local ret | ||
177 | ret, ctx = low(ctx, chunk, extra) | ||
178 | return ret | ||
179 | end | ||
180 | end | ||
181 | % | ||
182 | |||
183 | @stick# | ||
184 | function normalize(marker) | ||
185 | return filter.cycle(eol, 0, marker) | ||
186 | end | ||
187 | % | ||
188 | \end{lua} | ||
189 | \end{quote} | ||
190 | |||
191 | The \texttt{normalize} factory simply calls a more generic | ||
192 | factory, the \texttt{cycle} factory. This factory receives a | ||
193 | low-level filter, an initial context, and an extra | ||
194 | parameter, and returns the corresponding high-level filter. | ||
195 | Each time the high-level filer is passed a new chunk, it | ||
196 | invokes the low-level filter passing it the previous | ||
197 | context, the new chunk, and the extra argument. The | ||
198 | low-level filter in turn produces the chunk of processed | ||
199 | data and a new context. The high-level filter then updates | ||
200 | its internal context, and returns the processed chunk of | ||
201 | data to the user. It is the low-level filter that does all | ||
202 | the work. Notice that we take advantage of Lua's lexical | ||
203 | scoping to store the context in a closure between function | ||
204 | calls. | ||
205 | |||
206 | Concerning the low-level filter code, we must first accept | ||
207 | that there is no perfect solution to the end-of-line marker | ||
208 | normalization problem itself. The difficulty comes from an | ||
209 | inherent ambiguity on the definition of empty lines within | ||
210 | mixed input. However, the following solution works well for | ||
211 | any consistent input, as well as for non-empty lines in | ||
212 | mixed input. It also does a reasonable job with empty lines | ||
213 | and serves as a good example of how to implement a low-level | ||
214 | filter. | ||
215 | |||
216 | The idea is to consider both CR and~LF as end-of-line | ||
217 | \emph{candidates}. We issue a single break if any candidate | ||
218 | is seen alone, or followed by a different candidate. In | ||
219 | other words, CR~CR~and LF~LF each issue two end-of-line | ||
220 | markers, whereas CR~LF~and LF~CR issue only one marker each. | ||
221 | This idea correctly handles the Unix, DOS/MIME, VMS, and Mac | ||
222 | OS, as well as other more obscure conventions. | ||
223 | |||
224 | \subsection{The C part of the filter} | ||
225 | |||
226 | Our low-level filter is divided into two simple functions. | ||
227 | The inner function actually does the conversion. It takes | ||
228 | each input character in turn, deciding what to output and | ||
229 | how to modify the context. The context tells if the last | ||
230 | character processed was an end-of-line candidate, and if so, | ||
231 | which candidate it was. | ||
232 | \begin{quote} | ||
233 | \begin{C} | ||
234 | @stick# | ||
235 | @#define candidate(c) (c == CR || c == LF) | ||
236 | static int process(int c, int last, const char *marker, | ||
237 | luaL_Buffer *buffer) { | ||
238 | if (candidate(c)) { | ||
239 | if (candidate(last)) { | ||
240 | if (c == last) luaL_addstring(buffer, marker); | ||
241 | return 0; | ||
242 | } else { | ||
243 | luaL_addstring(buffer, marker); | ||
244 | return c; | ||
245 | } | ||
246 | } else { | ||
247 | luaL_putchar(buffer, c); | ||
248 | return 0; | ||
249 | } | ||
250 | } | ||
251 | % | ||
252 | \end{C} | ||
253 | \end{quote} | ||
254 | |||
255 | The inner function makes use of Lua's auxiliary library's | ||
256 | buffer interface for efficiency. The | ||
257 | outer function simply interfaces with Lua. It receives the | ||
258 | context and the input chunk (as well as an optional | ||
259 | custom end-of-line marker), and returns the transformed | ||
260 | output chunk and the new context. | ||
261 | \begin{quote} | ||
262 | \begin{C} | ||
263 | @stick# | ||
264 | static int eol(lua_State *L) { | ||
265 | int ctx = luaL_checkint(L, 1); | ||
266 | size_t isize = 0; | ||
267 | const char *input = luaL_optlstring(L, 2, NULL, &isize); | ||
268 | const char *last = input + isize; | ||
269 | const char *marker = luaL_optstring(L, 3, CRLF); | ||
270 | luaL_Buffer buffer; | ||
271 | luaL_buffinit(L, &buffer); | ||
272 | if (!input) { | ||
273 | lua_pushnil(L); | ||
274 | lua_pushnumber(L, 0); | ||
275 | return 2; | ||
276 | } | ||
277 | while (input < last) | ||
278 | ctx = process(*input++, ctx, marker, &buffer); | ||
279 | luaL_pushresult(&buffer); | ||
280 | lua_pushnumber(L, ctx); | ||
281 | return 2; | ||
282 | } | ||
283 | % | ||
284 | \end{C} | ||
285 | \end{quote} | ||
286 | |||
287 | Notice that if the input chunk is \texttt{nil}, the operation | ||
288 | is considered to be finished. In that case, the loop will | ||
289 | not execute a single time and the context is reset to the | ||
290 | initial state. This allows the filter to be reused many | ||
291 | times. | ||
292 | |||
293 | When designing your own filters, the challenging part is to | ||
294 | decide what will be the context. For line breaking, for | ||
295 | instance, it could be the number of bytes left in the | ||
296 | current line. For Base64 encoding, it could be a string | ||
297 | with the bytes that remain after the division of the input | ||
298 | into 3-byte atoms. The MIME module in the LuaSocket | ||
299 | distribution has many other examples. | ||
300 | |||
301 | \section{Filter chains} | ||
302 | |||
303 | Chains add a lot to the power of filters. For example, | ||
304 | according to the standard for Quoted-Printable encoding, the | ||
305 | text must be normalized into its canonic form prior to | ||
306 | encoding, as far as end-of-line markers are concerned. To | ||
307 | help specify complex transformations like these, we define a | ||
308 | chain factory that creates a composite filter from one or | ||
309 | more filters. A chained filter passes data through all | ||
310 | its components, and can be used wherever a primitive filter | ||
311 | is accepted. | ||
312 | |||
313 | The chaining factory is very simple. All it does is return a | ||
314 | function that passes data through all filters and returns | ||
315 | the result to the user. The auxiliary | ||
316 | function~\texttt{chainpair} can only chain two filters | ||
317 | together. In the auxiliary function, special care must be | ||
318 | taken if the chunk is the last. This is because the final | ||
319 | \texttt{nil} chunk notification has to be pushed through both | ||
320 | filters in turn: | ||
321 | \begin{quote} | ||
322 | \begin{lua} | ||
323 | @stick# | ||
324 | local function chainpair(f1, f2) | ||
325 | return function(chunk) | ||
326 | local ret = f2(f1(chunk)) | ||
327 | if chunk then return ret | ||
328 | else return ret .. f2() end | ||
329 | end | ||
330 | end | ||
331 | % | ||
332 | |||
333 | @stick# | ||
334 | function filter.chain(...) | ||
335 | local f = arg[1] | ||
336 | for i = 2, table.getn(arg) do | ||
337 | f = chainpair(f, arg[i]) | ||
338 | end | ||
339 | return f | ||
340 | end | ||
341 | % | ||
342 | \end{lua} | ||
343 | \end{quote} | ||
344 | |||
345 | Thanks to the chain factory, we can | ||
346 | trivially define the Quoted-Printable conversion: | ||
347 | \begin{quote} | ||
348 | \begin{lua} | ||
349 | @stick# | ||
350 | local qp = filter.chain(normalize("\r\n"), | ||
351 | encode("quoted-printable")) | ||
352 | local input = source.chain(source.file(io.stdin), qp) | ||
353 | local out = sink.file(io.stdout) | ||
354 | pump.all(input, out) | ||
355 | % | ||
356 | \end{lua} | ||
357 | \end{quote} | ||
358 | |||
359 | \section{Sources, sinks, and pumps} | ||
360 | |||
361 | The filters we introduced so far act as the internal nodes | ||
362 | in a network of transformations. Information flows from node | ||
363 | to node (or rather from one filter to the next) and is | ||
364 | transformed on its way out. Chaining filters together is our | ||
365 | way to connect nodes in this network. As the starting point | ||
366 | for the network, we need a source node that produces the | ||
367 | data. In the end of the network, we need a sink node that | ||
368 | gives a final destination to the data. | ||
369 | |||
370 | \subsection{Sources} | ||
371 | |||
372 | A source returns the next chunk of data each time it is | ||
373 | invoked. When there is no more data, it simply returns | ||
374 | \texttt{nil}. In the event of an error, the source can inform the | ||
375 | caller by returning \texttt{nil} followed by an error message. | ||
376 | |||
377 | Below are two simple source factories. The \texttt{empty} source | ||
378 | returns no data, possibly returning an associated error | ||
379 | message. The \texttt{file} source is more useful, and | ||
380 | yields the contents of a file in a chunk by chunk fashion. | ||
381 | \begin{quote} | ||
382 | \begin{lua} | ||
383 | @stick# | ||
384 | function source.empty(err) | ||
385 | return function() | ||
386 | return nil, err | ||
387 | end | ||
388 | end | ||
389 | % | ||
390 | |||
391 | @stick# | ||
392 | function source.file(handle, io_err) | ||
393 | if handle then | ||
394 | return function() | ||
395 | local chunk = handle:read(2048) | ||
396 | if not chunk then handle:close() end | ||
397 | return chunk | ||
398 | end | ||
399 | else return source.empty(io_err or "unable to open file") end | ||
400 | end | ||
401 | % | ||
402 | \end{lua} | ||
403 | \end{quote} | ||
404 | |||
405 | \subsection{Filtered sources} | ||
406 | |||
407 | It is often useful to chain a source with a filter. A | ||
408 | filtered source passes its data through the | ||
409 | associated filter before returning it to the caller. | ||
410 | Here is a factory that does the job: | ||
411 | \begin{quote} | ||
412 | \begin{lua} | ||
413 | @stick# | ||
414 | function source.chain(src, f) | ||
415 | return source.simplify(function() | ||
416 | if not src then return nil end | ||
417 | local chunk, err = src() | ||
418 | if not chunk then | ||
419 | src = nil | ||
420 | return f(nil) | ||
421 | else return f(chunk) end | ||
422 | end) | ||
423 | end | ||
424 | % | ||
425 | \end{lua} | ||
426 | \end{quote} | ||
427 | |||
428 | Our motivating example in the introduction chains a source | ||
429 | with a filter. Filtered sources are useful when working with | ||
430 | functions that get their input data from a source (such as | ||
431 | the pump in the example). By chaining a source with one or | ||
432 | more filters, the function can be transparently provided | ||
433 | with filtered data, with no need to change its interface. | ||
434 | |||
435 | \subsection{Sinks} | ||
436 | |||
437 | Just as we defined an interface for sources of | ||
438 | data, we can also define an interface for a | ||
439 | destination for data. We call any function respecting this | ||
440 | interface a \emph{sink}. In our first example, we used a | ||
441 | file sink connected to the standard output. | ||
442 | |||
443 | Sinks receive consecutive chunks of data, until the end of | ||
444 | data is notified with a \texttt{nil} chunk. A sink can be | ||
445 | notified of an error with an optional extra argument that | ||
446 | contains the error message, following a \texttt{nil} chunk. | ||
447 | If a sink detects an error itself, and | ||
448 | wishes not to be called again, it can return \texttt{nil}, | ||
449 | followed by an error message. A return value that | ||
450 | is not \texttt{nil} means the source will accept more data. | ||
451 | |||
452 | Below are two useful sink factories. | ||
453 | The table factory creates a sink that stores | ||
454 | individual chunks into an array. The data can later be | ||
455 | efficiently concatenated into a single string with Lua's | ||
456 | \texttt{table.concat} library function. The \texttt{null} sink | ||
457 | simply discards the chunks it receives: | ||
458 | \begin{quote} | ||
459 | \begin{lua} | ||
460 | @stick# | ||
461 | function sink.table(t) | ||
462 | t = t or {} | ||
463 | local f = function(chunk, err) | ||
464 | if chunk then table.insert(t, chunk) end | ||
465 | return 1 | ||
466 | end | ||
467 | return f, t | ||
468 | end | ||
469 | % | ||
470 | |||
471 | @stick# | ||
472 | local function null() | ||
473 | return 1 | ||
474 | end | ||
475 | |||
476 | function sink.null() | ||
477 | return null | ||
478 | end | ||
479 | % | ||
480 | \end{lua} | ||
481 | \end{quote} | ||
482 | |||
483 | Naturally, filtered sinks are just as useful as filtered | ||
484 | sources. A filtered sink passes each chunk it receives | ||
485 | through the associated filter before handing it to the | ||
486 | original sink. In the following example, we use a source | ||
487 | that reads from the standard input. The input chunks are | ||
488 | sent to a table sink, which has been coupled with a | ||
489 | normalization filter. The filtered chunks are then | ||
490 | concatenated from the output array, and finally sent to | ||
491 | standard out: | ||
492 | \begin{quote} | ||
493 | \begin{lua} | ||
494 | @stick# | ||
495 | local input = source.file(io.stdin) | ||
496 | local out, t = sink.table() | ||
497 | out = sink.chain(normalize("\r\n"), out) | ||
498 | pump.all(input, out) | ||
499 | io.write(table.concat(t)) | ||
500 | % | ||
501 | \end{lua} | ||
502 | \end{quote} | ||
503 | |||
504 | \subsection{Pumps} | ||
505 | |||
506 | Adrian Sietsma noticed that, although not on purpose, our | ||
507 | interface for sources is compatible with Lua iterators. | ||
508 | That is, a source can be neatly used in conjunction | ||
509 | with \texttt{for} loops. Using our file | ||
510 | source as an iterator, we can write the following code: | ||
511 | \begin{quote} | ||
512 | \begin{lua} | ||
513 | @stick# | ||
514 | for chunk in source.file(io.stdin) do | ||
515 | io.write(chunk) | ||
516 | end | ||
517 | % | ||
518 | \end{lua} | ||
519 | \end{quote} | ||
520 | |||
521 | Loops like this will always be present because everything | ||
522 | we designed so far is passive. Sources, sinks, filters: none | ||
523 | of them can do anything on their own. The operation of | ||
524 | pumping all data a source can provide into a sink is so | ||
525 | common that it deserves its own function: | ||
526 | \begin{quote} | ||
527 | \begin{lua} | ||
528 | @stick# | ||
529 | function pump.step(src, snk) | ||
530 | local chunk, src_err = src() | ||
531 | local ret, snk_err = snk(chunk, src_err) | ||
532 | return chunk and ret and not src_err and not snk_err, | ||
533 | src_err or snk_err | ||
534 | end | ||
535 | % | ||
536 | |||
537 | @stick# | ||
538 | function pump.all(src, snk, step) | ||
539 | step = step or pump.step | ||
540 | while true do | ||
541 | local ret, err = step(src, snk) | ||
542 | if not ret then return not err, err end | ||
543 | end | ||
544 | end | ||
545 | % | ||
546 | \end{lua} | ||
547 | \end{quote} | ||
548 | |||
549 | The \texttt{pump.step} function moves one chunk of data from | ||
550 | the source to the sink. The \texttt{pump.all} function takes | ||
551 | an optional \texttt{step} function and uses it to pump all the | ||
552 | data from the source to the sink. We can now use everything | ||
553 | we have to write a program that reads a binary file from | ||
554 | disk and stores it in another file, after encoding it to the | ||
555 | Base64 transfer content encoding: | ||
556 | \begin{quote} | ||
557 | \begin{lua} | ||
558 | @stick# | ||
559 | local input = source.chain( | ||
560 | source.file(io.open("input.bin", "rb")), | ||
561 | encode("base64")) | ||
562 | local out = sink.chain( | ||
563 | wrap(76), | ||
564 | sink.file(io.open("output.b64", "w"))) | ||
565 | pump.all(input, out) | ||
566 | % | ||
567 | \end{lua} | ||
568 | \end{quote} | ||
569 | |||
570 | The way we split the filters here is not intuitive, on | ||
571 | purpose. Alternatively, we could have chained the Base64 | ||
572 | encode filter and the line-wrap filter together, and then | ||
573 | chain the resulting filter with either the file source or | ||
574 | the file sink. It doesn't really matter. | ||
575 | |||
576 | \section{Exploding filters} | ||
577 | |||
578 | Our current filter interface has one flagrant shortcoming. | ||
579 | When David Burgess was writing his \texttt{gzip} filter, he | ||
580 | noticed that a decompression filter can explode a small | ||
581 | input chunk into a huge amount of data. To address this, we | ||
582 | decided to change our filter interface to allow exploding | ||
583 | filters to return large quantities of output data in a chunk | ||
584 | by chunk manner. | ||
585 | |||
586 | More specifically, after passing each chunk of input data to | ||
587 | a filter and collecting the first chunk of output data, the | ||
588 | user must now loop to receive data from the filter until no | ||
589 | filtered data is left. Within these secondary calls, the | ||
590 | caller passes an empty string to the filter. The filter | ||
591 | responds with an empty string when it is ready for the next | ||
592 | input chunk. In the end, after the user passes a | ||
593 | \texttt{nil} chunk notifying the filter that there is no | ||
594 | more input data, the filter might still have to produce too | ||
595 | much output data to return in a single chunk. The user has | ||
596 | to loop again, this time passing \texttt{nil} each time, | ||
597 | until the filter itself returns \texttt{nil} to notify the | ||
598 | user it is finally done. | ||
599 | |||
600 | Fortunately, it is very easy to modify a filter to respect | ||
601 | the new interface. In fact, the end-of-line translation | ||
602 | filter we presented earlier already conforms to it. The | ||
603 | complexity is encapsulated within the chaining functions, | ||
604 | which must now include a loop. Since these functions only | ||
605 | have to be written once, the user is not affected. | ||
606 | Interestingly, the modifications do not have a measurable | ||
607 | negative impact on the performance of filters that do | ||
608 | not need the added flexibility. On the other hand, for a | ||
609 | small price in complexity, the changes make exploding | ||
610 | filters practical. | ||
611 | |||
612 | \section{A complex example} | ||
613 | |||
614 | The LTN12 module in the \texttt{LuaSocket} distribution | ||
615 | implements the ideas we have described. The MIME | ||
616 | and SMTP modules are especially integrated with LTN12, | ||
617 | and can be used to showcase the expressive power of filters, | ||
618 | sources, sinks, and pumps. Below is an example | ||
619 | of how a user would proceed to define and send a | ||
620 | multipart message with attachments, using \texttt{LuaSocket}: | ||
621 | \begin{quote} | ||
622 | \begin{mime} | ||
623 | local smtp = require"socket.smtp" | ||
624 | local mime = require"mime" | ||
625 | local ltn12 = require"ltn12" | ||
626 | |||
627 | local message = smtp.message{ | ||
628 | headers = { | ||
629 | from = "Sicrano <sicrano@example.com>", | ||
630 | to = "Fulano <fulano@example.com>", | ||
631 | subject = "A message with an attachment"}, | ||
632 | body = { | ||
633 | preamble = "Hope you can see the attachment\r\n", | ||
634 | [1] = { | ||
635 | body = "Here is our logo\r\n"}, | ||
636 | [2] = { | ||
637 | headers = { | ||
638 | ["content-type"] = 'image/png; name="luasocket.png"', | ||
639 | ["content-disposition"] = | ||
640 | 'attachment; filename="luasocket.png"', | ||
641 | ["content-description"] = 'LuaSocket logo', | ||
642 | ["content-transfer-encoding"] = "BASE64"}, | ||
643 | body = ltn12.source.chain( | ||
644 | ltn12.source.file(io.open("luasocket.png", "rb")), | ||
645 | ltn12.filter.chain( | ||
646 | mime.encode("base64"), | ||
647 | mime.wrap()))}}} | ||
648 | |||
649 | assert(smtp.send{ | ||
650 | rcpt = "<fulano@example.com>", | ||
651 | from = "<sicrano@example.com>", | ||
652 | source = message}) | ||
653 | \end{mime} | ||
654 | \end{quote} | ||
655 | |||
656 | The \texttt{smtp.message} function receives a table | ||
657 | describing the message, and returns a source. The | ||
658 | \texttt{smtp.send} function takes this source, chains it with the | ||
659 | SMTP dot-stuffing filter, connects a socket sink | ||
660 | to the server, and simply pumps the data. The message is never | ||
661 | assembled in memory. Everything is produced on demand, | ||
662 | transformed in small pieces, and sent to the server in chunks, | ||
663 | including the file attachment that is loaded from disk and | ||
664 | encoded on the fly. It just works. | ||
665 | |||
666 | \section{Conclusions} | ||
667 | |||
668 | In this article we introduce the concepts of filters, | ||
669 | sources, sinks, and pumps to the Lua language. These are | ||
670 | useful tools for data processing in general. Sources provide | ||
671 | a simple abstraction for data acquisition. Sinks provide an | ||
672 | abstraction for final data destinations. Filters define an | ||
673 | interface for data transformations. The chaining of | ||
674 | filters, sources and sinks provides an elegant way to create | ||
675 | arbitrarily complex data transformations from simpler | ||
676 | transformations. Pumps simply move the data through. | ||
677 | |||
678 | \end{document} | ||
diff --git a/gem/makefile b/gem/makefile new file mode 100644 index 0000000..a4287c2 --- /dev/null +++ b/gem/makefile | |||
@@ -0,0 +1,14 @@ | |||
1 | ltn012.pdf: ltn012.ps | ||
2 | ./myps2pdf ltn012.ps | ||
3 | |||
4 | ltn012.ps: ltn012.dvi | ||
5 | dvips -G0 -t letter -o ltn012.ps ltn012.dvi | ||
6 | |||
7 | ltn012.dvi: ltn012.tex | ||
8 | latex ltn012 | ||
9 | |||
10 | clean: | ||
11 | rm -f *~ *.log *.aux *.bbl *.blg ltn012.pdf ltn012.ps ltn012.dvi ltn012.lof ltn012.toc ltn012.lot | ||
12 | |||
13 | pdf: ltn012.pdf | ||
14 | open ltn012.pdf | ||
diff --git a/gem/myps2pdf b/gem/myps2pdf new file mode 100755 index 0000000..78c23e5 --- /dev/null +++ b/gem/myps2pdf | |||
@@ -0,0 +1,113 @@ | |||
1 | #!/bin/sh - | ||
2 | do_opt=1 | ||
3 | best=0 | ||
4 | rot=0 | ||
5 | a4=0 | ||
6 | eps=0 | ||
7 | usage="Usage: $0 [-no_opt] [-best] [-rot] [-a4] [-eps] in.ps [out.pdf]" | ||
8 | |||
9 | case "x$1" in | ||
10 | "x-no_opt") do_opt=0 ; shift ;; | ||
11 | esac | ||
12 | |||
13 | case "x$1" in | ||
14 | "x-best") best=1 ; shift ;; | ||
15 | esac | ||
16 | |||
17 | case "x$1" in | ||
18 | "x-rot") rot=1 ; shift ;; | ||
19 | esac | ||
20 | |||
21 | case "x$1" in | ||
22 | "x-a4") a4=1 ; shift ;; | ||
23 | esac | ||
24 | |||
25 | case "x$1" in | ||
26 | "x-eps") eps=1 ; shift ;; | ||
27 | esac | ||
28 | |||
29 | case $# in | ||
30 | 2) ifilename=$1 ; ofilename=$2 ;; | ||
31 | 1) ifilename=$1 | ||
32 | if `echo $1 | grep -i '\.e*ps$' > /dev/null` | ||
33 | then | ||
34 | ofilename=`echo $1 | sed 's/\..*$/.pdf/'` | ||
35 | else | ||
36 | echo "$usage" 1>&2 | ||
37 | exit 1 | ||
38 | fi ;; | ||
39 | *) echo "$usage" 1>&2 ; exit 1 ;; | ||
40 | esac | ||
41 | |||
42 | if [ $best == 1 ] | ||
43 | then | ||
44 | options="-dPDFSETTINGS=/prepress \ | ||
45 | -r1200 \ | ||
46 | -dMonoImageResolution=1200 \ | ||
47 | -dGrayImageResolution=1200 \ | ||
48 | -dColorImageResolution=1200 \ | ||
49 | -dDownsampleMonoImages=false \ | ||
50 | -dDownsampleGrayImages=false \ | ||
51 | -dDownsampleColorImages=false \ | ||
52 | -dAutoFilterMonoImages=false \ | ||
53 | -dAutoFilterGrayImages=false \ | ||
54 | -dAutoFilterColorImages=false \ | ||
55 | -dMonoImageFilter=/FlateEncode \ | ||
56 | -dGrayImageFilter=/FlateEncode \ | ||
57 | -dColorImageFilter=/FlateEncode" | ||
58 | else | ||
59 | options="-dPDFSETTINGS=/prepress \ | ||
60 | -r600 \ | ||
61 | -dDownsampleMonoImages=true \ | ||
62 | -dDownsampleGrayImages=true \ | ||
63 | -dDownsampleColorImages=true \ | ||
64 | -dMonoImageDownsampleThreshold=2.0 \ | ||
65 | -dGrayImageDownsampleThreshold=1.5 \ | ||
66 | -dColorImageDownsampleThreshold=1.5 \ | ||
67 | -dMonoImageResolution=600 \ | ||
68 | -dGrayImageResolution=600 \ | ||
69 | -dColorImageResolution=600 \ | ||
70 | -dAutoFilterMonoImages=false \ | ||
71 | -dMonoImageFilter=/FlateEncode \ | ||
72 | -dAutoFilterGrayImages=true \ | ||
73 | -dAutoFilterColorImages=true" | ||
74 | fi | ||
75 | |||
76 | if [ $rot == 1 ] | ||
77 | then | ||
78 | options="$options -dAutoRotatePages=/PageByPage" | ||
79 | fi | ||
80 | |||
81 | if [ $eps == 1 ] | ||
82 | then | ||
83 | options="$options -dEPSCrop" | ||
84 | fi | ||
85 | |||
86 | set -x | ||
87 | |||
88 | if [ $a4 == 1 ] | ||
89 | then | ||
90 | # Resize from A4 to letter size | ||
91 | psresize -Pa4 -pletter "$ifilename" myps2pdf.temp.ps | ||
92 | ifilename=myps2pdf.temp.ps | ||
93 | fi | ||
94 | |||
95 | gs -q -dSAFER -dNOPAUSE -dBATCH \ | ||
96 | -sDEVICE=pdfwrite -sPAPERSIZE=letter -sOutputFile=myps2pdf.temp.pdf \ | ||
97 | -dCompatibilityLevel=1.3 \ | ||
98 | $options \ | ||
99 | -dMaxSubsetPct=100 \ | ||
100 | -dSubsetFonts=true \ | ||
101 | -dEmbedAllFonts=true \ | ||
102 | -dColorConversionStrategy=/LeaveColorUnchanged \ | ||
103 | -dDoThumbnails=true \ | ||
104 | -dPreserveEPSInfo=true \ | ||
105 | -c .setpdfwrite -f "$ifilename" | ||
106 | |||
107 | if [ $do_opt == 1 ] | ||
108 | then | ||
109 | pdfopt myps2pdf.temp.pdf $ofilename | ||
110 | else | ||
111 | mv myps2pdf.temp.pdf $ofilename | ||
112 | fi | ||
113 | rm -f myps2pdf.temp.pdf myps2pdf.temp.ps | ||