-
Notifications
You must be signed in to change notification settings - Fork 0
/
tangle.web
3324 lines (2906 loc) · 127 KB
/
tangle.web
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% This program by D. E. Knuth is not copyrighted and can be used freely.
% Version 0 was released in December, 1981.
% Version 1 was released in September, 1982, with version 0 of TeX.
% Slight changes were made in October, 1982, for version 0.6 of TeX.
% Version 1.2 introduced {:nnn} comments, added @@= and @@\ (December, 1982).
% Version 1.4 added "history" (February, 1983).
% Version 1.5 conformed to TeX version 0.96 and fixed @@\ (March, 1983).
% Version 1.7 introduced the new change file format (June, 1983).
% Version 2.0 was released in July, 1983, with version 0.999 of TeX.
% Version 2.5 was released in November, 1983, with version 1.0 of TeX.
% Version 2.6 fixed a bug: force-line-break after a constant (August, 1984).
% Version 2.7 fixed the definition of check_sum_prime (May, 1985).
% Version 2.8 fixed a bug in change_buffer movement (August, 1985).
% Version 2.9 allows nonnumeric macros before their def (December, 1988).
% Version 3, for Sewell's book, fixed long-line bug in input_ln (March, 1989).
% Version 4 was major change to allow 8-bit input (September, 1989).
% Version 4.1 conforms to ANSI standard for-loop rules (September, 1990).
% Version 4.2 fixes stat report if phase one dies (March, 1991).
% Version 4.3 fixes @@ bug in verbatim, catches extra } (September, 1991).
% Version 4.4 activates debug_help on errors as advertised (February, 1993).
% Version 4.5 prevents modno-comments from being split across lines (Dec 2002).
% Version 4.6 fixes archaic @@z logic; is again big enough for TeX (Jan 2021).
% Here is TeX material that gets inserted after \input webmac
\def\hang{\hangindent 3em\indent\ignorespaces}
\font\ninerm=cmr9
\let\mc=\ninerm % medium caps for names like SAIL
\def\PASCAL{Pascal}
\def\pb{$\.|\ldots\.|$} % Pascal brackets (|...|)
\def\v{\.{\char'174}} % vertical (|) in typewriter font
\mathchardef\BA="3224 % double arrow
\def\({} % kludge for alphabetizing certain module names
\def\title{TANGLE}
\def\contentspagenumber{125} % should be odd
\def\topofcontents{\null\vfill
\titlefalse % include headline on the contents page
\def\rheader{\mainfont Appendix E\hfil \contentspagenumber}
\centerline{\titlefont The {\ttitlefont TANGLE} processor}
\vskip 15pt
\centerline{(Version 4.6)}
\vfill}
\pageno=\contentspagenumber \advance\pageno by 1
@* Introduction.
This program converts a \.{WEB} file to a \PASCAL\ file. It was written
by D. E. Knuth in September, 1981; a somewhat similar {\mc SAIL} program had
been developed in March, 1979. Since this program describes itself, a
bootstrapping process involving hand-translation had to be used to get started.
For large \.{WEB} files one should have a large memory, since \.{TANGLE} keeps
all the \PASCAL\ text in memory (in an abbreviated form). The program uses
a few features of the local \PASCAL\ compiler that may need to be changed in
other installations:
\yskip\item{1)} Case statements have a default.
\item{2)} Input-output routines may need to be adapted for use with a particular
character set and/or for printing messages on the user's terminal.
\yskip\noindent
These features are also present in the \PASCAL\ version of \TeX, where they
are used in a similar (but more complex) way. System-dependent portions
of \.{TANGLE} can be identified by looking at the entries for `system
dependencies' in the index below.
@!@^system dependencies@>
The ``banner line'' defined here should be changed whenever \.{TANGLE}
is modified.
@d banner=='This is TANGLE, Version 4.6'
@ The program begins with a fairly normal header, made up of pieces that
@^system dependencies@>
will mostly be filled in later. The \.{WEB} input comes from files |web_file|
and |change_file|, the \PASCAL\ output goes to file |Pascal_file|,
and the string pool output goes to file |pool|.
If it is necessary to abort the job because of a fatal error, the program
calls the `|jump_out|' procedure, which goes to the label |end_of_TANGLE|.
@d end_of_TANGLE = 9999 {go here to wrap it up}
@p @t\4@>@<Compiler directives@>@/
program TANGLE(@!web_file,@!change_file,@!Pascal_file,@!pool);
label end_of_TANGLE; {go here to finish}
const @<Constants in the outer block@>@/
type @<Types in the outer block@>@/
var @<Globals in the outer block@>@/
@<Error handling procedures@>@/
procedure initialize;
var @<Local variables for initialization@>@/
begin @<Set initial values@>@/
end;
@ Some of this code is optional for use when debugging only;
such material is enclosed between the delimiters |debug| and $|gubed|$.
Other parts, delimited by |stat| and $|tats|$, are optionally included if
statistics about \.{TANGLE}'s memory usage are desired.
@d debug==@{ {change this to `$\\{debug}\equiv\null$' when debugging}
@d gubed==@t@>@} {change this to `$\\{gubed}\equiv\null$' when debugging}
@f debug==begin
@f gubed==end
@#
@d stat==@{ {change this to `$\\{stat}\equiv\null$'
when gathering usage statistics}
@d tats==@t@>@} {change this to `$\\{tats}\equiv\null$'
when gathering usage statistics}
@f stat==begin
@f tats==end
@ The \PASCAL\ compiler used to develop this system has ``compiler
directives'' that can appear in comments whose first character is a dollar sign.
In production versions of \.{TANGLE} these directives tell the compiler that
@^system dependencies@>
it is safe to avoid range checks and to leave out the extra code it inserts
for the \PASCAL\ debugger's benefit, although interrupts will occur if
there is arithmetic overflow.
@<Compiler directives@>=
@{@&$C-,A+,D-@} {no range check, catch arithmetic overflow, no debug overhead}
@!debug @{@&$C+,D+@}@+ gubed {but turn everything on when debugging}
@ Labels are given symbolic names by the following definitions. We insert
the label `|exit|:' just before the `\ignorespaces|end|\unskip' of a
procedure in which we have used the `|return|' statement defined below;
the label `|restart|' is occasionally used at the very beginning of a
procedure; and the label `|reswitch|' is occasionally used just prior to
a \&{case} statement in which some cases change the conditions and we wish to
branch to the newly applicable case.
Loops that are set up with the \&{loop} construction defined below are
commonly exited by going to `|done|' or to `|found|' or to `|not_found|',
and they are sometimes repeated by going to `|continue|'.
@d exit=10 {go here to leave a procedure}
@d restart=20 {go here to start a procedure again}
@d reswitch=21 {go here to start a case statement again}
@d continue=22 {go here to resume a loop}
@d done=30 {go here to exit a loop}
@d found=31 {go here when you've found it}
@d not_found=32 {go here when you've found something else}
@ Here are some macros for common programming idioms.
@d incr(#) == #:=#+1 {increase a variable by unity}
@d decr(#) == #:=#-1 {decrease a variable by unity}
@d loop == @+ while true do@+ {repeat over and over until a |goto| happens}
@d do_nothing == {empty statement}
@d return == goto exit {terminate a procedure call}
@f return == nil
@f loop == xclause
@ We assume that |case| statements may include a default case that applies
if no matching label is found. Thus, we shall use constructions like
@^system dependencies@>
$$\vbox{\halign{#\hfil\cr
|case x of|\cr
1: $\langle\,$code for $x=1\,\rangle$;\cr
3: $\langle\,$code for $x=3\,\rangle$;\cr
|othercases| $\langle\,$code for |x<>1| and |x<>3|$\,\rangle$\cr
|endcases|\cr}}$$
since most \PASCAL\ compilers have plugged this hole in the language by
incorporating some sort of default mechanism. For example, the compiler
used to develop \.{WEB} and \TeX\ allows `|others|:' as a default label,
and other \PASCAL s allow syntaxes like `\ignorespaces|else|\unskip' or
`\&{otherwise}' or `\\{otherwise}:', etc. The definitions of |othercases|
and |endcases| should be changed to agree with local conventions. (Of
course, if no default mechanism is available, the |case| statements of
this program must be extended by listing all remaining cases. The author
would have taken the trouble to modify \.{TANGLE} so that such extensions
were done automatically, if he had not wanted to encourage \PASCAL\
compiler writers to make this important change in \PASCAL, where it belongs.)
@d othercases == others: {default for cases not listed explicitly}
@d endcases == @+end {follows the default case in an extended |case| statement}
@f othercases == else
@f endcases == end
@ The following parameters are set big enough to handle \TeX, so they
should be sufficient for most applications of \.{TANGLE}.
@<Constants...@>=
@!buf_size=100; {maximum length of input line}
@!max_bytes=45000; {|1/ww| times the number of bytes in identifiers,
strings, and module names; must be less than 65536}
@!max_toks=65000; {|1/zz| times the number of bytes in compressed \PASCAL\ code;
must be less than 65536}
@!max_names=4000; {number of identifiers, strings, module names;
must be less than 10240}
@!max_texts=2000; {number of replacement texts, must be less than 10240}
@!hash_size=353; {should be prime}
@!longest_name=400; {module names shouldn't be longer than this}
@!line_length=72; {lines of \PASCAL\ output have at most this many characters}
@!out_buf_size=144; {length of output buffer, should be twice |line_length|}
@!stack_size=50; {number of simultaneous levels of macro expansion}
@!max_id_length=12; {long identifiers are chopped to this length, which must
not exceed |line_length|}
@!unambig_length=7; {identifiers must be unique if chopped to this length}
{note that 7 is more strict than \PASCAL's 8, but this can be varied}
@ A global variable called |history| will contain one of four values
at the end of every run: |spotless| means that no unusual messages were
printed; |harmless_message| means that a message of possible interest
was printed but no serious errors were detected; |error_message| means that
at least one error was found; |fatal_message| means that the program
terminated abnormally. The value of |history| does not influence the
behavior of the program; it is simply computed for the convenience
of systems that might want to use such information.
@d spotless=0 {|history| value for normal jobs}
@d harmless_message=1 {|history| value when non-serious info was printed}
@d error_message=2 {|history| value when an error was noted}
@d fatal_message=3 {|history| value when we had to stop prematurely}
@#
@d mark_harmless==@t@>@+if history=spotless then history:=harmless_message
@d mark_error==history:=error_message
@d mark_fatal==history:=fatal_message
@<Glob...@>=@!history:spotless..fatal_message; {how bad was this run?}
@ @<Set init...@>=history:=spotless;
@* The character set.
One of the main goals in the design of \.{WEB} has been to make it readily
portable between a wide variety of computers. Yet \.{WEB} by its very
nature must use a greater variety of characters than most computer
programs deal with, and character encoding is one of the areas in which
existing machines differ most widely from each other.
To resolve this problem, all input to \.{WEAVE} and \.{TANGLE} is converted
to an internal eight-bit code that is essentially standard ASCII, the ``American
Standard Code for Information Interchange.'' The conversion is done
immediately when each character is read in. Conversely, characters are
converted from ASCII to the user's external representation just before
they are output. (The original ASCII code was seven bits only; \.{WEB} now
allows eight bits in an attempt to keep up with modern times.)
Such an internal code is relevant to users of \.{WEB} only because it is
the code used for preprocessed constants like \.{"A"}. If you are writing
a program in \.{WEB} that makes use of such one-character constants, you
should convert your input to ASCII form, like \.{WEAVE} and \.{TANGLE} do.
Otherwise \.{WEB}'s internal coding scheme does not affect you.
@^ASCII code@>
Here is a table of the standard visible ASCII codes:
$$\def\:{\char\count255\global\advance\count255 by 1}
\count255='40
\vbox{
\hbox{\hbox to 40pt{\it\hfill0\/\hfill}%
\hbox to 40pt{\it\hfill1\/\hfill}%
\hbox to 40pt{\it\hfill2\/\hfill}%
\hbox to 40pt{\it\hfill3\/\hfill}%
\hbox to 40pt{\it\hfill4\/\hfill}%
\hbox to 40pt{\it\hfill5\/\hfill}%
\hbox to 40pt{\it\hfill6\/\hfill}%
\hbox to 40pt{\it\hfill7\/\hfill}}
\vskip 4pt
\hrule
\def\^{\vrule height 10.5pt depth 4.5pt}
\halign{\hbox to 0pt{\hskip -24pt\O{#0}\hfill}&\^
\hbox to 40pt{\tt\hfill#\hfill\^}&
&\hbox to 40pt{\tt\hfill#\hfill\^}\cr
04&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
05&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
06&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
07&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
10&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
11&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
12&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
13&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
14&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
15&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
16&\:&\:&\:&\:&\:&\:&\:&\:\cr\noalign{\hrule}
17&\:&\:&\:&\:&\:&\:&\:\cr}
\hrule width 280pt}$$
(Actually, of course, code @'040 is an invisible blank space.) Code @'136
was once an upward arrow (\.{\char'13}), and code @'137 was
once a left arrow (\.^^X), in olden times when the first draft
of ASCII code was prepared; but \.{WEB} works with today's standard
ASCII in which those codes represent circumflex and underline as shown.
@<Types...@>=
@!ASCII_code=0..255; {eight-bit numbers, a subrange of the integers}
@ The original \PASCAL\ compiler was designed in the late 60s, when six-bit
character sets were common, so it did not make provision for lowercase
letters. Nowadays, of course, we need to deal with both capital and small
letters in a convenient way, so \.{WEB} assumes that it is being used
with a \PASCAL\ whose character set contains at least the characters of
standard ASCII as listed above. Some \PASCAL\ compilers use the original
name |char| for the data type associated with the characters in text files,
while other \PASCAL s consider |char| to be a 64-element subrange of a larger
data type that has some other name.
In order to accommodate this difference, we shall use the name |text_char|
to stand for the data type of the characters in the input and output
files. We shall also assume that |text_char| consists of the elements
|chr(first_text_char)| through |chr(last_text_char)|, inclusive. The
following definitions should be adjusted if necessary.
@^system dependencies@>
@d text_char == char {the data type of characters in text files}
@d first_text_char=0 {ordinal number of the smallest element of |text_char|}
@d last_text_char=255 {ordinal number of the largest element of |text_char|}
@<Types...@>=
@!text_file=packed file of text_char;
@ The \.{WEAVE} and \.{TANGLE} processors convert between ASCII code and
the user's external character set by means of arrays |xord| and |xchr|
that are analogous to \PASCAL's |ord| and |chr| functions.
@<Globals...@>=
@!xord: array [text_char] of ASCII_code;
{specifies conversion of input characters}
@!xchr: array [ASCII_code] of text_char;
{specifies conversion of output characters}
@ If we assume that every system using \.{WEB} is able to read and write the
visible characters of standard ASCII (although not necessarily using the
ASCII codes to represent them), the following assignment statements initialize
most of the |xchr| array properly, without needing any system-dependent
changes. For example, the statement \.{xchr[@@\'101]:=\'A\'} that appears
in the present \.{WEB} file might be encoded in, say, {\mc EBCDIC} code
on the external medium on which it resides, but \.{TANGLE} will convert from
this external code to ASCII and back again. Therefore the assignment
statement \.{XCHR[65]:=\'A\'} will appear in the corresponding \PASCAL\ file,
and \PASCAL\ will compile this statement so that |xchr[65]| receives the
character \.A in the external (|char|) code. Note that it would be quite
incorrect to say \.{xchr[@@\'101]:="A"}, because |"A"| is a constant of
type |integer|, not |char|, and because we have $|"A"|=65$ regardless of
the external character set.
@<Set init...@>=
xchr[@'40]:=' ';
xchr[@'41]:='!';
xchr[@'42]:='"';
xchr[@'43]:='#';
xchr[@'44]:='$';
xchr[@'45]:='%';
xchr[@'46]:='&';
xchr[@'47]:='''';@/
xchr[@'50]:='(';
xchr[@'51]:=')';
xchr[@'52]:='*';
xchr[@'53]:='+';
xchr[@'54]:=',';
xchr[@'55]:='-';
xchr[@'56]:='.';
xchr[@'57]:='/';@/
xchr[@'60]:='0';
xchr[@'61]:='1';
xchr[@'62]:='2';
xchr[@'63]:='3';
xchr[@'64]:='4';
xchr[@'65]:='5';
xchr[@'66]:='6';
xchr[@'67]:='7';@/
xchr[@'70]:='8';
xchr[@'71]:='9';
xchr[@'72]:=':';
xchr[@'73]:=';';
xchr[@'74]:='<';
xchr[@'75]:='=';
xchr[@'76]:='>';
xchr[@'77]:='?';@/
xchr[@'100]:='@@';
xchr[@'101]:='A';
xchr[@'102]:='B';
xchr[@'103]:='C';
xchr[@'104]:='D';
xchr[@'105]:='E';
xchr[@'106]:='F';
xchr[@'107]:='G';@/
xchr[@'110]:='H';
xchr[@'111]:='I';
xchr[@'112]:='J';
xchr[@'113]:='K';
xchr[@'114]:='L';
xchr[@'115]:='M';
xchr[@'116]:='N';
xchr[@'117]:='O';@/
xchr[@'120]:='P';
xchr[@'121]:='Q';
xchr[@'122]:='R';
xchr[@'123]:='S';
xchr[@'124]:='T';
xchr[@'125]:='U';
xchr[@'126]:='V';
xchr[@'127]:='W';@/
xchr[@'130]:='X';
xchr[@'131]:='Y';
xchr[@'132]:='Z';
xchr[@'133]:='[';
xchr[@'134]:='\';
xchr[@'135]:=']';
xchr[@'136]:='^';
xchr[@'137]:='_';@/
xchr[@'140]:='`';
xchr[@'141]:='a';
xchr[@'142]:='b';
xchr[@'143]:='c';
xchr[@'144]:='d';
xchr[@'145]:='e';
xchr[@'146]:='f';
xchr[@'147]:='g';@/
xchr[@'150]:='h';
xchr[@'151]:='i';
xchr[@'152]:='j';
xchr[@'153]:='k';
xchr[@'154]:='l';
xchr[@'155]:='m';
xchr[@'156]:='n';
xchr[@'157]:='o';@/
xchr[@'160]:='p';
xchr[@'161]:='q';
xchr[@'162]:='r';
xchr[@'163]:='s';
xchr[@'164]:='t';
xchr[@'165]:='u';
xchr[@'166]:='v';
xchr[@'167]:='w';@/
xchr[@'170]:='x';
xchr[@'171]:='y';
xchr[@'172]:='z';
xchr[@'173]:='{';
xchr[@'174]:='|';
xchr[@'175]:='}';
xchr[@'176]:='~';@/
xchr[0]:=' '; xchr[@'177]:=' '; {these ASCII codes are not used}
@ Some of the ASCII codes below @'40 have been given symbolic names in
\.{WEAVE} and \.{TANGLE} because they are used with a special meaning.
@d and_sign=@'4 {equivalent to `\.{and}'}
@d not_sign=@'5 {equivalent to `\.{not}'}
@d set_element_sign=@'6 {equivalent to `\.{in}'}
@d tab_mark=@'11 {ASCII code used as tab-skip}
@d line_feed=@'12 {ASCII code thrown away at end of line}
@d form_feed=@'14 {ASCII code used at end of page}
@d carriage_return=@'15 {ASCII code used at end of line}
@d left_arrow=@'30 {equivalent to `\.{:=}'}
@d not_equal=@'32 {equivalent to `\.{<>}'}
@d less_or_equal=@'34 {equivalent to `\.{<=}'}
@d greater_or_equal=@'35 {equivalent to `\.{>=}'}
@d equivalence_sign=@'36 {equivalent to `\.{==}'}
@d or_sign=@'37 {equivalent to `\.{or}'}
@ When we initialize the |xord| array and the remaining parts of |xchr|,
it will be convenient to make use of an index variable, |i|.
@<Local variables for init...@>=
@!i:0..255;
@ Here now is the system-dependent part of the character set.
If \.{WEB} is being implemented on a garden-variety \PASCAL\ for which
only standard ASCII codes will appear in the input and output files, you
don't need to make any changes here. But if you have, for example, an extended
character set like the one in Appendix~C of {\sl The \TeX book}, the first
line of code in this module should be changed to
$$\hbox{|for i:=1 to @'37 do xchr[i]:=chr(i);|}$$
\.{WEB}'s character set is essentially identical to \TeX's, even with respect to
characters less than @'40.
@^system dependencies@>
Changes to the present module will make \.{WEB} more friendly on computers
that have an extended character set, so that one can type things like
\.^^Z\ instead of \.{<>}. If you have an extended set of characters that
are easily incorporated into text files, you can assign codes arbitrarily
here, giving an |xchr| equivalent to whatever characters the users of
\.{WEB} are allowed to have in their input files, provided that unsuitable
characters do not correspond to special codes like |carriage_return|
that are listed above.
(The present file \.{TANGLE.WEB} does not contain any of the non-ASCII
characters, because it is intended to be used with all implementations of
\.{WEB}. It was originally created on a Stanford system that has a
convenient extended character set, then ``sanitized'' by applying another
program that transliterated all of the non-standard characters into
standard equivalents.)
@<Set init...@>=
for i:=1 to @'37 do xchr[i]:=' ';
for i:=@'200 to @'377 do xchr[i]:=' ';
@ The following system-independent code makes the |xord| array contain a
suitable inverse to the information in |xchr|.
@<Set init...@>=
for i:=first_text_char to last_text_char do xord[chr(i)]:=" ";
for i:=1 to @'377 do xord[xchr[i]]:=i;
xord[' ']:=" ";
@* Input and output.
The input conventions of this program are intended to be very much like those
of \TeX\ (except, of course, that they are much simpler, because much less
needs to be done). Furthermore they are identical to those of \.{WEAVE}.
Therefore people who need to make modifications to all three systems
should be able to do so without too many headaches.
We use the standard \PASCAL\ input/output procedures in several places that
\TeX\ cannot, since \.{TANGLE} does not have to deal with files that are named
dynamically by the user, and since there is no input from the terminal.
@ Terminal output is done by writing on file |term_out|, which is assumed to
consist of characters of type |text_char|:
@^system dependencies@>
@d print(#)==write(term_out,#) {`|print|' means write on the terminal}
@d print_ln(#)==write_ln(term_out,#) {`|print|' and then start new line}
@d new_line==write_ln(term_out) {start new line}
@d print_nl(#)== {print information starting on a new line}
begin new_line; print(#);
end
@<Globals...@>=
@!term_out:text_file; {the terminal as an output file}
@ Different systems have different ways of specifying that the output on a
certain file will appear on the user's terminal. Here is one way to do this
on the \PASCAL\ system that was used in \.{TANGLE}'s initial development:
@^system dependencies@>
@<Set init...@>=
rewrite(term_out,'TTY:'); {send |term_out| output to the terminal}
@ The |update_terminal| procedure is called when we want
to make sure that everything we have output to the terminal so far has
actually left the computer's internal buffers and been sent.
@^system dependencies@>
@d update_terminal == break(term_out) {empty the terminal output buffer}
@ The main input comes from |web_file|; this input may be overridden
by changes in |change_file|. (If |change_file| is empty, there are no changes.)
@<Globals...@>=
@!web_file:text_file; {primary input}
@!change_file:text_file; {updates}
@ The following code opens the input files. Since these files were listed
in the program header, we assume that the \PASCAL\ runtime system has
already checked that suitable file names have been given; therefore no
additional error checking needs to be done.
@^system dependencies@>
@p procedure open_input; {prepare to read |web_file| and |change_file|}
begin reset(web_file); reset(change_file);
end;
@ The main output goes to |Pascal_file|, and string pool constants are
written to the |pool| file.
@<Globals...@>=
@!Pascal_file: text_file;
@!pool: text_file;
@ The following code opens |Pascal_file| and |pool|.
Since these files were listed in the program header, we assume that the
\PASCAL\ runtime system has checked that suitable external file names have
been given.
@^system dependencies@>
@<Set init...@>=
rewrite(Pascal_file); rewrite(pool);
@ Input goes into an array called |buffer|.
@<Globals...@>=@!buffer: array[0..buf_size] of ASCII_code;
@ The |input_ln| procedure brings the next line of input from the specified
file into the |buffer| array and returns the value |true|, unless the file has
already been entirely read, in which case it returns |false|. The conventions
of \TeX\ are followed; i.e., |ASCII_code| numbers representing the next line
of the file are input into |buffer[0]|, |buffer[1]|, \dots,
|buffer[limit-1]|; trailing blanks are ignored;
and the global variable |limit| is set to the length of the
@^system dependencies@>
line. The value of |limit| must be strictly less than |buf_size|.
We assume that none of the |ASCII_code| values
of |buffer[j]| for |0<=j<limit| is equal to 0, @'177, |line_feed|, |form_feed|,
or |carriage_return|.
@p function input_ln(var f:text_file):boolean;
{inputs a line or returns |false|}
var final_limit:0..buf_size; {|limit| without trailing blanks}
begin limit:=0; final_limit:=0;
if eof(f) then input_ln:=false
else begin while not eoln(f) do
begin buffer[limit]:=xord[f^]; get(f);
incr(limit);
if buffer[limit-1]<>" " then final_limit:=limit;
if limit=buf_size then
begin while not eoln(f) do get(f);
decr(limit); {keep |buffer[buf_size]| empty}
if final_limit>limit then final_limit:=limit;
print_nl('! Input line too long'); loc:=0; error;
@.Input line too long@>
end;
end;
read_ln(f); limit:=final_limit; input_ln:=true;
end;
end;
@* Reporting errors to the user.
The \.{TANGLE} processor operates in two phases: first it inputs the source
file and stores a compressed representation of the program, then it produces
the \PASCAL\ output from the compressed representation.
The global variable |phase_one| tells whether we are in Phase I or not.
@<Globals...@>=
@!phase_one: boolean; {|true| in Phase I, |false| in Phase II}
@ If an error is detected while we are debugging,
we usually want to look at the contents of memory.
A special procedure will be declared later for this purpose.
@<Error handling...@>=
@!debug @+ procedure debug_help; forward;@+ gubed
@ During the first phase, syntax errors are reported to the user by saying
$$\hbox{`|err_print('! Error message')|'},$$
followed by `|jump_out|' if no recovery from the error is provided.
This will print the error message followed by an indication of where the error
was spotted in the source file. Note that no period follows the error message,
since the error routine will automatically supply a period.
Errors that are noticed during the second phase are reported to the user
in the same fashion, but the error message will be
followed by an indication of where the error was spotted in the output file.
The actual error indications are provided by a procedure called |error|.
@d err_print(#)==begin new_line; print(#); error;
end
@<Error handling...@>=
procedure error; {prints '\..' and location of error message}
var j: 0..out_buf_size; {index into |out_buf|}
@!k,@!l: 0..buf_size; {indices into |buffer|}
begin if phase_one then @<Print error location based on input buffer@>
else @<Print error location based on output buffer@>;
update_terminal; mark_error;
@!debug debug_skipped:=debug_cycle; debug_help;@+gubed
end;
@ The error locations during Phase I can be indicated by using the global
variables |loc|, |line|, and |changing|, which tell respectively the first
unlooked-at position in |buffer|, the current line number, and whether or not
the current line is from |change_file| or |web_file|.
This routine should be modified on systems whose standard text editor
has special line-numbering conventions.
@^system dependencies@>
@<Print error location based on input buffer@>=
begin if changing then print('. (change file ')@+else print('. (');
print_ln('l.', line:1, ')');
if loc>=limit then l:=limit else l:=loc;
for k:=1 to l do
if buffer[k-1]=tab_mark then print(' ')
else print(xchr[buffer[k-1]]); {print the characters already read}
new_line;
for k:=1 to l do print(' '); {space out the next line}
for k:=l+1 to limit do print(xchr[buffer[k-1]]); {print the part not yet read}
print(' '); {this space separates the message from future asterisks}
end
@ The position of errors detected during the second phase can be indicated
by outputting the partially-filled output buffer, which contains |out_ptr|
entries.
@<Print error location based on output...@>=
begin print_ln('. (l.',line:1,')');
for j:=1 to out_ptr do print(xchr[out_buf[j-1]]); {print current partial line}
print('... '); {indicate that this information is partial}
end
@ The |jump_out| procedure just cuts across all active procedure levels
and jumps out of the program. This is the only non-local |goto| statement
in \.{TANGLE}. It is used when no recovery from a particular error has
been provided.
Some \PASCAL\ compilers do not implement non-local |goto| statements.
@^system dependencies@>
In such cases the code that appears at label |end_of_TANGLE| should be
copied into the |jump_out| procedure, followed by a call to a system procedure
that terminates the program.
@d fatal_error(#)==begin new_line; print(#); error; mark_fatal; jump_out;
end
@<Error handling...@>=
procedure jump_out;
begin goto end_of_TANGLE;
end;
@ Sometimes the program's behavior is far different from what it should be,
and \.{TANGLE} prints an error message that is really for the \.{TANGLE}
maintenance person, not the user. In such cases the program says
|confusion('indication of where we are')|.
@d confusion(#)==fatal_error('! This can''t happen (',#,')')
@.This can't happen@>
@ An overflow stop occurs if \.{TANGLE}'s tables aren't large enough.
@d overflow(#)==fatal_error('! Sorry, ',#,' capacity exceeded')
@.Sorry, x capacity exceeded@>
@* Data structures.
Most of the user's \PASCAL\ code is packed into eight-bit integers
in two large arrays called |byte_mem| and |tok_mem|.
The |byte_mem| array holds the names of identifiers, strings, and modules;
the |tok_mem| array holds the replacement texts
for macros and modules. Allocation is sequential, since things are deleted only
during Phase II, and only in a last-in-first-out manner.
Auxiliary arrays |byte_start| and |tok_start| are used as directories to
|byte_mem| and |tok_mem|, and the |link|, |ilk|, |equiv|, and |text_link|
arrays give further information about names. These auxiliary arrays
consist of sixteen-bit items.
@<Types...@>=
@!eight_bits=0..255; {unsigned one-byte quantity}
@!sixteen_bits=0..65535; {unsigned two-byte quantity}
@ \.{TANGLE} has been designed to avoid the need for indices that are more
than sixteen bits wide, so that it can be used on most computers. But
there are programs that need more than 65536 tokens, and some programs
even need more than 65536 bytes; \TeX\ is one of these. To get around
this problem, a slight complication has been added to the data structures:
|byte_mem| and |tok_mem| are two-dimensional arrays, whose first index is
either 0 or 1 or 2. (For generality, the first index is actually allowed to run
between 0 and |ww-1| in |byte_mem|, or between 0 and |zz-1| in |tok_mem|,
where |ww| and |zz| are set to 2 and~3; the program will work for any
positive values of |ww| and |zz|, and it can be simplified in obvious ways
if |ww=1| or |zz=1|.)
@d ww=2 {we multiply the byte capacity by approximately this amount}
@d zz=3 {we multiply the token capacity by approximately this amount}
@<Globals...@>=
@!byte_mem: packed array [0..ww-1,0..max_bytes] of ASCII_code;
{characters of names}
@!tok_mem: packed array [0..zz-1,0..max_toks] of eight_bits; {tokens}
@!byte_start: array [0..max_names] of sixteen_bits; {directory into |byte_mem|}
@!tok_start: array [0..max_texts] of sixteen_bits; {directory into |tok_mem|}
@!link: array [0..max_names] of sixteen_bits; {hash table or tree links}
@!ilk: array [0..max_names] of sixteen_bits; {type codes or tree links}
@!equiv: array [0..max_names] of sixteen_bits; {info corresponding to names}
@!text_link: array [0..max_texts] of sixteen_bits; {relates replacement texts}
@ The names of identifiers are found by computing a hash address |h| and
then looking at strings of bytes signified by |hash[h]|, |link[hash[h]]|,
|link[link[hash[h]]]|, \dots, until either finding the desired name
or encountering a zero.
A `|name_pointer|' variable, which signifies a name, is an index into
|byte_start|. The actual sequence of characters in the name pointed to by
|p| appears in positions |byte_start[p]| to |byte_start[p+ww]-1|, inclusive,
in the segment of |byte_mem| whose first index is |p mod ww|. Thus, when
|ww=2| the even-numbered name bytes appear in |byte_mem[0,@t$*$@>]|
and the odd-numbered ones appear in |byte_mem[1,@t$*$@>]|.
The pointer 0 is used for undefined module names; we don't
want to use it for the names of identifiers, since 0 stands for a null
pointer in a linked list.
Strings are treated like identifiers; the first character (a double-quote)
distinguishes a string from an alphabetic name, but for \.{TANGLE}'s purposes
strings behave like numeric macros. (A `string' here refers to the
strings delimited by double-quotes that \.{TANGLE} processes. \PASCAL\
string constants delimited by single-quote marks are not given such special
treatment; they simply appear as sequences of characters in the \PASCAL\
texts.) The total number of strings in the string
pool is called |string_ptr|, and the total number of names in |byte_mem|
is called |name_ptr|. The total number of bytes occupied in
|byte_mem[w,@t$*$@>]| is called |byte_ptr[w]|.
We usually have |byte_start[name_ptr+w]=byte_ptr[(name_ptr+w) mod ww]|
for |0<=w<ww|, since these are the starting positions for the next |ww|
names to be stored in |byte_mem|.
@d length(#)==byte_start[#+ww]-byte_start[#] {the length of a name}
@<Types...@>=
@!name_pointer=0..max_names; {identifies a name}
@ @<Global...@>=
@!name_ptr:name_pointer; {first unused position in |byte_start|}
@!string_ptr:name_pointer; {next number to be given to a string of length |<>1|}
@!byte_ptr:array [0..ww-1] of 0..max_bytes;
{first unused position in |byte_mem|}
@!pool_check_sum:integer; {sort of a hash for the whole string pool}
@ @<Local variables for init...@>=
@!wi: 0..ww-1; {to initialize the |byte_mem| indices}
@ @<Set init...@>=
for wi:=0 to ww-1 do
begin byte_start[wi]:=0; byte_ptr[wi]:=0;
end;
byte_start[ww]:=0; {this makes name 0 of length zero}
name_ptr:=1; string_ptr:=256; pool_check_sum:=271828;
@ Replacement texts are stored in |tok_mem|, using similar conventions.
A `|text_pointer|' variable is an index into |tok_start|, and the
replacement text that corresponds to |p| runs from positions
|tok_start[p]| to |tok_start[p+zz]-1|, inclusive, in the segment of
|tok_mem| whose first index is |p mod zz|. Thus, when |zz=2| the
even-numbered replacement texts appear in |tok_mem[0,@t$*$@>]| and the
odd-numbered ones appear in |tok_mem[1,@t$*$@>]|. Furthermore,
|text_link[p]| is used to connect pieces of text that have the same name,
as we shall see later. The pointer 0 is used for undefined replacement
texts.
The first position of |tok_mem[z,@t$*$@>]| that is unoccupied by
replacement text is called |tok_ptr[z]|, and the first unused location of
|tok_start| is called |text_ptr|. We usually have the identity
|tok_start[text_ptr+z]=tok_ptr[(text_ptr+z) mod zz]|, for |0<=z<zz|, since
these are the starting positions for the next |zz| replacement texts to
be stored in |tok_mem|.
@<Types...@>=
@!text_pointer=0..max_texts; {identifies a replacement text}
@ It is convenient to maintain a variable |z| that is equal to |text_ptr
mod zz|, so that we always insert tokens into segment |z| of |tok_mem|.
@<Glob...@>=
@t\hskip1em@>@!text_ptr:text_pointer; {first unused position in |tok_start|}
@t\hskip1em@>@!tok_ptr:array[0..zz-1] of 0..max_toks;
{first unused position in a given segment of |tok_mem|}
@t\hskip1em@>@!z:0..zz-1; {current segment of |tok_mem|}
stat @!max_tok_ptr:array[0..zz-1] of 0..max_toks;
{largest values assumed by |tok_ptr|}
tats
@ @<Local variables for init...@>=
@!zi:0..zz-1; {to initialize the |tok_mem| indices}
@ @<Set init...@>=
for zi:=0 to zz-1 do
begin tok_start[zi]:=0; tok_ptr[zi]:=0;
end;
tok_start[zz]:=0; {this makes replacement text 0 of length zero}
text_ptr:=1; z:=1 mod zz;
@ Four types of identifiers are distinguished by their |ilk|:
\yskip\hang |normal| identifiers will appear in the \PASCAL\ program as
ordinary identifiers since they have not been defined to be macros; the
corresponding value in the |equiv| array
for such identifiers is a link in a secondary hash table that
is used to check whether any two of them agree in their first |unambig_length|
characters after underline symbols are removed and lowercase letters are
changed to uppercase.
\yskip\hang |numeric| identifiers have been defined to be numeric macros;
their |equiv| value contains the corresponding numeric value plus $2^{15}$.
Strings are treated as numeric macros.
\yskip\hang |simple| identifiers have been defined to be simple macros;
their |equiv| value points to the corresponding replacement text.
\yskip\hang |parametric| identifiers have been defined to be parametric macros;
like simple identifiers, their |equiv| value points to the replacement text.
@d normal=0 {ordinary identifiers have |normal| ilk}
@d numeric=1 {numeric macros and strings have |numeric| ilk}
@d simple=2 {simple macros have |simple| ilk}
@d parametric=3 {parametric macros have |parametric| ilk}
@ The names of modules are stored in |byte_mem| together
with the identifier names, but a hash table is not used for them because
\.{TANGLE} needs to be able to recognize a module name when given a prefix of
that name. A conventional binary search tree is used to retrieve module names,
with fields called |llink| and |rlink| in place of |link| and |ilk|. The
root of this tree is |rlink[0]|. If |p| is a pointer to a module name,
|equiv[p]| points to its replacement text, just as in simple and parametric
macros, unless this replacement text has not yet been defined (in which case
|equiv[p]=0|).
@d llink==link {left link in binary search tree for module names}
@d rlink==ilk {right link in binary search tree for module names}
@<Set init...@>=
rlink[0]:=0; {the binary search tree starts out with nothing in it}
equiv[0]:=0; {the undefined module has no replacement text}
@ Here is a little procedure that prints the text of a given name.
@p procedure print_id(@!p:name_pointer); {print identifier or module name}
var k:0..max_bytes; {index into |byte_mem|}
@!w:0..ww-1; {segment of |byte_mem|}
begin if p>=name_ptr then print('IMPOSSIBLE')
else begin w:=p mod ww;
for k:=byte_start[p] to byte_start[p+ww]-1 do print(xchr[byte_mem[w,k]]);
end;
end;
@* Searching for identifiers.
The hash table described above is updated by the |id_lookup| procedure,
which finds a given identifier and returns a pointer to its index in
|byte_start|. If the identifier was not already present, it is inserted with
a given |ilk| code; and an error message is printed if the identifier is being
doubly defined.
Because of the way \.{TANGLE}'s scanning mechanism works, it is most convenient
to let |id_lookup| search for an identifier that is present in the |buffer|
array. Two other global variables specify its position in the buffer: the
first character is |buffer[id_first]|, and the last is |buffer[id_loc-1]|.
Furthermore, if the identifier is really a string, the global variable
|double_chars| tells how many of the characters in the buffer appear
twice (namely \.{@@@@} and \.{""}), since this additional information makes
it easy to calculate the true length of the string. The final double-quote
of the string is not included in its ``identifier,'' but the first one is,
so the string length is |id_loc-id_first-double_chars-1|.
We have mentioned that |normal| identifiers belong to two hash tables,
one for their true names as they appear in the \.{WEB} file and the other
when they have been reduced to their first |unambig_length| characters.
The hash tables are kept by the method of simple chaining, where the
heads of the individual lists appear in the |hash| and |chop_hash| arrays.
If |h| is a hash code, the primary hash table list starts at |hash[h]| and
proceeds through |link| pointers; the secondary hash table list starts at
|chop_hash[h]| and proceeds through |equiv| pointers. Of course, the same
identifier will probably have two different values of |h|.
The |id_lookup| procedure uses an auxiliary array called |chopped_id| to
contain up to |unambig_length| characters of the current identifier, if
it is necessary to compute the secondary hash code. (This array could be
declared local to |id_lookup|, but in general we are making all array
declarations global in this program, because some compilers and some machine
architectures make dynamic array allocation inefficient.)
@<Glob...@>=
@!id_first:0..buf_size; {where the current identifier begins in the buffer}
@!id_loc:0..buf_size; {just after the current identifier in the buffer}
@!double_chars:0..buf_size; {correction to length in case of strings}
@#
@!hash,@!chop_hash:array [0..hash_size] of sixteen_bits; {heads of hash lists}
@!chopped_id:array [0..unambig_length] of ASCII_code; {chopped identifier}
@ Initially all the hash lists are empty.
@<Local variables for init...@>=
@!h:0..hash_size; {index into hash-head arrays}
@ @<Set init...@>=
for h:=0 to hash_size-1 do
begin hash[h]:=0; chop_hash[h]:=0;
end;
@ Here now is the main procedure for finding identifiers (and strings).
The parameter |t| is set to |normal| except when the identifier is
a macro name that is just being defined; in the latter case, |t| will be
|numeric|, |simple|, or |parametric|.
@p function id_lookup(@!t:eight_bits):name_pointer; {finds current identifier}
label found, not_found;
var c:eight_bits; {byte being chopped}
@!i:0..buf_size; {index into |buffer|}
@!h:0..hash_size; {hash code}
@!k:0..max_bytes; {index into |byte_mem|}
@!w:0..ww-1; {segment of |byte_mem|}
@!l:0..buf_size; {length of the given identifier}
@!p,@!q:name_pointer; {where the identifier is being sought}
@!s:0..unambig_length; {index into |chopped_id|}
begin l:=id_loc-id_first; {compute the length}
@<Compute the hash code |h|@>;
@<Compute the name location |p|@>;
if (p=name_ptr)or(t<>normal) then
@<Update the tables and check for possible errors@>;
id_lookup:=p;
end;
@ A simple hash code is used: If the sequence of
ASCII codes is $c_1c_2\ldots c_n$, its hash value will be
$$(2^{n-1}c_1+2^{n-2}c_2+\cdots+c_n)\,\bmod\,|hash_size|.$$
@<Compute the hash...@>=
h:=buffer[id_first]; i:=id_first+1;
while i<id_loc do
begin h:=(h+h+buffer[i]) mod hash_size; incr(i);
end
@ If the identifier is new, it will be placed in position |p=name_ptr|,
otherwise |p| will point to its existing location.
@<Compute the name location...@>=
p:=hash[h];
while p<>0 do
begin if length(p)=l then
@<Compare name |p| with current identifier, |goto found| if equal@>;
p:=link[p];
end;
p:=name_ptr; {the current identifier is new}