-
Notifications
You must be signed in to change notification settings - Fork 11
/
index.html
1491 lines (1320 loc) · 61.2 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<title>Indic Layout Requirements</title>
<meta charset="utf-8"/>
<script src="https://www.w3.org/Tools/respec/respec-w3c-common" async class="remove"></script>
<script class="remove">
// ReSpec configuration for the "Indic Layout Requirements" document.
// ReSpec reads this global object when it processes the page and uses it to
// generate the document header, status boilerplate and the references section.
var respecConfig = {
  // specification status (e.g. WD, LCWD, WG-NOTE, etc.). If in doubt use ED.
  specStatus: "ED",
  //publishDate: "2017-02-20",
  previousPublishDate: "2017-02-20",
  previousMaturity: "WD",
  // This document is targeted at a Working Group Note, not a Recommendation.
  noRecTrack: true,
  shortName: "ilreq",
  copyrightStart: "2014",
  edDraftURI: "https://w3c.github.io/ilreq/",

  // People listed as editors in the generated document header.
  editors: [
    { name: "Swaran Lata", mailto: "slata@deity.gov.in", company: "DeitY", w3cid: 39008 },
    { name: "Somnath Chandra", mailto: "schandra@deity.gov.in", company: "DeitY" },
    { name: "Prashant Verma", mailto: "vermaprashant1@gmail.com", company: "Web Standardization Initiative, DeitY" },
  ],

  // Publishing group and feedback channels.
  wg: "Internationalization Working Group",
  wgURI: "https://www.w3.org/International/core/",
  wgPublicList: "public-i18n-indic",
  github: "w3c/ilreq",
  wgPatentURI: "https://www.w3.org/2004/01/pp-impl/32113/status",

  // Document-local bibliography entries; the keys are the identifiers used
  // in [[...]] / [[!...]] citations in the body text.
  localBiblio: {
    "Code-Charts": {
      title: "Unicode Code Charts",
      href: "http://www.unicode.org/charts/",
    },
    "Evolution-of-Indic-Scripts": {
      title: "Indic Scripts",
      href: "http://www.ciillibrary.org/Sites/Photography/PhotographyHome.html",
    },
    "CLDR": {
      title: "Unicode CLDR",
      href: "http://cldr.unicode.org",
    },
    "South-Asian-Scripts": {
      title: "Unicode Technical note#10 : South Asian Scripts",
      href: "http://www.unicode.org/notes/tn10/",
    },
    "UAX29": {
      title: "Grapheme Cluster boundaries",
      href: "http://www.unicode.org/reports/tr29/",
    },
    "UAX14": {
      title: "Unicode Line Breaking Algorithm",
      href: "http://www.unicode.org/reports/tr14/",
    },
    "Normalization": {
      title: "Unicode Normalization",
      href: "http://unicode.org/reports/tr15/",
    },
    "Draft-Script-Grammar": {
      // Title spelling corrected (was "Draft-Scrip-Grammer Devanagari").
      title: "Draft Script Grammar: Devanagari",
      href: "http://tdil-dc.in/index.php?option=com_vertical&task=view-article&article_id=149&lang=en",
    }
  }
};
</script>
<link rel="stylesheet" href="local.css" type="text/css" />
</head>
<body>
<div id="abstract">
<p>This document describes the basic requirements for Indic script layout and text support on the Web and in Digital Publications. These requirements provide information for Web technologies such as CSS, HTML, and SVG about how to support users of Indic scripts. The current document focuses on Devanagari, but there are plans to widen the scope to encompass additional Indian scripts as time goes on.</p>
</div>
<div id="sotd">
<p>This document describes the basic requirements for Indic script layout and text support on the Web and in eBooks. These requirements provide information for Web technologies such as CSS, HTML and SVG about how to support users of Indic scripts. The current document focuses on Devanagari, but there are plans to widen the scope to encompass additional Indian scripts as time goes on. </p>
<p>The editor's draft of this document is being developed by the <a href="https://www.w3.org/International/groups/indic-layout/">Indic Layout Task Force</a>, part of the W3C <a href="https://www.w3.org/International/ig/">Internationalization Interest Group</a>. It is published by the <a href="https://www.w3.org/International/core/">Internationalization Working Group</a>. The end target for this document is a Working Group Note.</p>
<div class="note">
<p data-lang="en" style="font-weight: bold; font-size: 120%">Sending comments on this document</p>
<p data-lang="en">If you wish to make comments regarding this document, please raise them as <a href="https://github.com/w3c/ilreq/issues" style="font-size: 120%;">github issues</a>. Only send comments by email if you are unable to raise issues on github (see links below). All comments are welcome.</p>
<p data-lang="en">To make it easier to track comments, please raise separate issues or emails for each comment, and point to the section you are commenting on using a URL for the dated version of the document.</p>
</div>
</div>
<section id="h_introduction">
<h2>Introduction</h2>
<section id="h_about_this_document">
<h2>About this document</h2>
<p>This document describes the basic requirements for the layout of Indian languages for display purposes. It discusses some of the major layout requirements for the first-letter pseudo-element, vertical arrangement of characters, letter spacing, text segmentation, and line-breaking rules in Indic languages.</p>
<p>The current document focuses on Devanagari, but there are plans to widen the scope to encompass additional Indian scripts as time goes on.</p>
<p>The minimal requirements presented in this document for Indian languages text layout will also be used in E-publishing and CSS standards. This document covers major issues of e-content in Indian languages in order to create a standard format of text layout to address storage, rendering problems, vertical writing, letter spacing, line breaking, etc.</p>
<p>It also describes a set of <abbr title="Augmented Backus–Naur Form">ABNF</abbr>-based rules for valid segmentation of Indic orthographic syllables in order to get the proper display in browsers. Text segmentation [[!UAX29]] and line breaking [[!UAX14]] algorithms are considered in detail. Standards for CSS and digital publications will benefit from this document.</p>
</section>
<section id="h_indian_language_complexities">
<h2>Indian language complexities</h2>
<p>India has large linguistic diversity, with 22 constitutionally recognized languages and 12 scripts. This document is currently focused largely on the Devanagari script. The expectation is that over time its scope will widen to cover additional major scripts from the list below.</p>
<p>The mapping between languages and scripts is complex. Multiple languages may have common scripts, while a language can be written in multiple scripts. Each language and script is unique in nature and cannot be easily replicated, even if they share common characteristics. The orthographic changes may also occur in some languages and adoption of new orthography is a gradual process, thus posing additional challenges.</p>
<table class="tab-format">
<tr>
<td>Serial No.</td>
<td>Language</td>
<td>Script</td>
</tr>
<tr>
<td>1</td>
<td>Hindi</td>
<td>Devanagari</td>
</tr>
<tr>
<td>2</td>
<td>Sanskrit</td>
<td>Devanagari</td>
</tr>
<tr>
<td>3</td>
<td>Marathi</td>
<td>Devanagari</td>
</tr>
<tr>
<td>4</td>
<td>Konkani</td>
<td>Devanagari</td>
</tr>
<tr>
<td>5</td>
<td>Nepali</td>
<td>Devanagari</td>
</tr>
<tr>
<td>6</td>
<td>Maithili</td>
<td>Devanagari</td>
</tr>
<tr>
<td>7</td>
<td>Sindhi</td>
<td>Devanagari, Perso-Arabic</td>
</tr>
<tr>
<td>8</td>
<td>Bodo</td>
<td>Devanagari</td>
</tr>
<tr>
<td>9</td>
<td>Dogri</td>
<td>Devanagari</td>
</tr>
<tr>
<td>10</td>
<td>Bengali</td>
<td>Bengali</td>
</tr>
<tr>
<td>11</td>
<td>Assamese</td>
<td>Bengali</td>
</tr>
<tr>
<td>12</td>
<td>Manipuri</td>
<td>Bengali, Meetei (Mayak)</td>
</tr>
<tr>
<td>13</td>
<td>Gujarati</td>
<td>Gujarati</td>
</tr>
<tr>
<td>14</td>
<td>Kannada</td>
<td>Kannada</td>
</tr>
<tr>
<td>15</td>
<td>Malayalam</td>
<td>Malayalam</td>
</tr>
<tr>
<td>16</td>
<td>Odia</td>
<td>Odia</td>
</tr>
<tr>
<td>17</td>
<td>Punjabi</td>
<td>Gurmukhi</td>
</tr>
<tr>
<td>18</td>
<td>Tamil</td>
<td>Tamil</td>
</tr>
<tr>
<td>19</td>
<td>Telugu</td>
<td>Telugu</td>
</tr>
<tr>
<td>20</td>
<td>Urdu</td>
<td>Perso-Arabic</td>
</tr>
<tr>
<td>21</td>
<td>Santhali</td>
<td>Ol-Chiki, Devanagari</td>
</tr>
<tr>
<td>22</td>
<td>Kashmiri</td>
<td>Devanagari, Perso-Arabic</td>
</tr>
</table>
<p>The scripts of South Asia share so many common features that a side-by-side comparison
of a few will often reveal structural similarities, even in modern letter forms. They are all
abugidas in which most symbols stand for a consonant with an inherent vowel. The North Indian branch of scripts was, like Brahmi itself, mainly used to write Indo-European languages such as Pali and Sanskrit, and eventually the Hindi, Bengali, and Gujarati
languages, though it was also the source for scripts for non-Indo-European languages such
as Tibetan, Mongolian, and Lepcha, as well as many South-East Asian scripts. The South Indian scripts are also derived from Brahmi and, therefore, share many similarities in structural characteristics. For more details visit [[!South-Asian-Scripts]].</p>
<p><a href="#fig_script_development"></a> shows the evolution of Indian scripts over a period of time from Brahmi script.</p>
<figure id="fig_script_development"> <img src="images/Brahmi.jpg" width="617" height="637" alt="Evolution of Indic Scripts"/>
<figcaption>Development of Indian Scripts</figcaption>
</figure>
<p>For more details visit [[!Evolution-of-Indic-Scripts]].</p>
</section>
<section id="h_basic_components_of_indian_languages">
<h3>Basic components of Indian languages</h3>
<section id="h_unicode_and_cldr">
<h4>Unicode & CLDR</h4>
<p><dfn id="def_unicode">Unicode</dfn> is the Universal character encoding standard, used for representing text for information processing. Unicode encodes all of the individual characters used for all the written languages of the world. The standards provide information about the character and their use. </p>
<p><dfn id="def_cldr">Common Locale Data Repository</dfn> is the largest standard repository of locale data in the world. It is managed by the Unicode Consortium. It provides locale data in an XML format for use in computer applications. It facilitates locale-related information sharing among applications regardless of their domains. Its goal is to provide basic linguistic information for diverse “locales” in an open, interoperable form.</p>
<p>This data is usable for localizing applications. </p>
<p>Some examples of the information that CLDR gathers for languages and territories are:</p>
<ul>
<li>Date formats </li>
<li>Time Zones </li>
<li>Number formats </li>
<li>Currency and its formats </li>
<li>Measurement Systems</li>
<li>Collation (Sort order) Specification: Sorting, Searching and Matching</li>
<li>Translations of names for language, territory, script, time zones, currencies </li>
<li>Script and exemplar characters used by a language</li>
<li>Calendaring rules, Formats and important dates.</li>
</ul>
<p class="note">Reference URL: [[!CLDR]]</p>
</section>
<section id="h_unicode_normalization">
<h4>Unicode Normalization</h4>
<p>Unicode normalization [[!UAX15]] is a form of text normalization that transforms equivalent sequences of characters into the same representation. Unicode normalization is important in Unicode text processing applications, because it affects the semantics of comparing, searching, and sorting Unicode sequences.</p>
<p>When a unique representation is required, a normalized form of Unicode text can be used to eliminate unwanted distinctions. The key part of normalization is to provide a unique canonical order for visually non-distinct sequences of combining characters.</p>
<section id="h_canonical_and_compatible_equivalence">
<h5>Canonical & Compatible Equivalence</h5>
<p>Unicode contains numerous characters to maintain compatibility with existing standards, some of which are functionally equivalent to other characters or sequences of characters. Because of this, Unicode defines some code point sequences as equivalent. Unicode provides two notions of equivalence: canonical and compatible.</p>
<p>Canonical equivalence is a form of equivalence that preserves visually and functionally equivalent characters.</p>
<p><a href="#fig_canonical_equivalence"></a> shows the canonical equivalence:</p>
<figure id="fig_canonical_equivalence"> <img src="images/can-eq.jpg" width="547" height="83" alt="Canonical equivalence in Hindi" />
<figcaption>Canonical Equivalence</figcaption>
</figure>
</section>
</section>
<section id="h_unicode_code_charts_devanagari">
<h4>Unicode Code charts – Devanagari & Devanagari Extended</h4>
<p>The following Unicode Character Code chart is per the Unicode Standard:</p>
<figure> <img src="images/code-chart.jpg" width="549" height="777" alt="Devanagari and Devanagari extended Code Chart" />
<figcaption>Unicode Devanagari and Devanagari extended Code Chart</figcaption>
</figure>
<p class="note">The latest versions of the Unicode online code charts are available at [[!Code-Charts]]. The charts provide a character reference for all 12 scripts of Indian languages. These files contain an excerpt from the character code tables and the list of character names for the latest version of the Unicode Standard.</p>
</section>
</section>
</section>
<section id="h_indic_orthographic_syllable_boundaries">
<h2>Indic orthographic syllable boundaries</h2>
<section id="h_need_for_abnf_valid_segmentation">
<h3>Need for ABNF valid segmentation</h3>
<p>ABNF Valid Segmentation based Indic orthographic syllable definition is provided here for correct and standardized representation of Indian languages layout. This will address various issues mentioned in the following sections. </p>
<p>This definition will be useful in order to get the uniform display of Indic layout in the browsers, applications, Digital publishing etc.</p>
</section>
<section id="h_abnf_based_definition_of_indic_syllable">
<h3>ABNF based definition of Indic orthographic syllable</h3>
<p><b>V[m] |{CH}C[v][m]|CH </b></p>
<p>The linguistic definition of the Indic orthographic syllable has been mapped to ABNF (Augmented Backus–Naur Form) for the purpose of text segmentation, line breaking, drop letter, letter spacing in horizontal text, and vertical text representation. The definition is elaborated using examples from various Indic scripts in the tables below.</p>
<p>The definition is a combination of 3 rules :</p>
<p>Rule 1 : V[m]</p>
<p>Rule 2 : {CH}C[v][m]</p>
<p>Rule 3 : CH (This rule is applicable only at the end of the word)</p>
V(upper case) is independent vowel<br />
<br />
m is modifier (Anusvara/Visarga/Chandrabindu)<br />
<br />
C is a consonant with inherent vowel which may or may not include a single nukta <br />
<br />
v (lower case) is any dependent vowel or vowel sign (mātrā)<br />
<br />
H is halant / virama<br />
<br />
| is a rule separator<br />
<br />
[ ] - The item enclosed in these brackets is optional<br />
<br />
{} - The enclosed item or items occur zero or more times </section>
<section id="h_abnf_syllabe_definition_use_cases_hindi">
<h3>Various example use cases of ABNF based Indic orthographic syllable definition for Indian languages</h3>
<p><strong>Rule 1 : V[m]</strong></p>
<div>
<table class="tab-format">
<tr>
<td><p> </p></td>
<td><p><strong>V (Vowel) is a syllable</strong></p></td>
<td><p><strong>V+ Modifier is a syllable</strong></p></td>
</tr>
<tr>
<td><p>Hindi </p></td>
<td><p>अ, ई, उ</p></td>
<td><p>अं, उँ, आः</p></td>
</tr>
<tr>
<td><p>Kannada</p></td>
<td><p>ಅ, ಇ</p></td>
<td><p>ಅಂ , ಅಃ </p></td>
</tr>
<tr>
<td><p>Tamil</p></td>
<td><p>அ, ஆ, இ</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Telugu</p></td>
<td><p>అ, ఇ,</p></td>
<td><p>అం , ఆః</p></td>
</tr>
<tr>
<td><p>Malayalam</p></td>
<td><p>അ, ഇ, ഉ</p></td>
<td><p>അം, അഃ</p></td>
</tr>
<tr>
<td><p>Bengali</p></td>
<td><p>অ , ই , ঋ</p></td>
<td><p>উঃ , এঁ, আঁ</p></td>
</tr>
<tr>
<td><p>Nepali</p></td>
<td><p>अ, आ, इ, उ</p></td>
<td><p>अँ, अं, उः</p></td>
</tr>
<tr>
<td><p>Manipuri language of Bengali script</p></td>
<td><p>অ, ই, উ</p></td>
<td><p>ওঁ, অং (হোয়)</p></td>
</tr>
<tr>
<td><p>Kashmiri language of Devanagari script</p></td>
<td><p>अ , ऑ ,ऒ ,ऎ </p></td>
<td><p>अँ</p></td>
</tr>
<tr>
<td><p>Maithili</p></td>
<td><p>अ, ई, उ </p></td>
<td><p>अं, उँ, आः </p></td>
</tr>
<tr>
<td><p>Dogri</p></td>
<td><p>अ, ई, उ</p></td>
<td><p>अं</p></td>
</tr>
<tr>
<td><p>Odia</p></td>
<td><p>ଅ, ଈ, ଉ</p></td>
<td><p>ଅଂ, ଉଁ, ଆଃ</p></td>
</tr>
<tr>
<td><p>Punjabi</p></td>
<td><p>ਅ, ਆ, ਇ</p></td>
<td><p>ਇੰ, ਉਂ</p></td>
</tr>
<tr>
<td><p>Sanskrit (Excluding Vedic Extensions)</p></td>
<td><p>अ, ई, उ</p></td>
<td><p>अं, उँ, आः</p></td>
</tr>
<tr>
<td><p>Marathi</p></td>
<td><p>अ, ई, ऐ</p></td>
<td><p>अं, उँ, आः</p></td>
</tr>
<tr>
<td><p>Assamese</p></td>
<td><p>অ , ই , ঈ</p></td>
<td><p>অঁ , অং, আঁ , ইঃ</p></td>
</tr>
<tr>
<td><p>Santhali language of Devanagari script</p></td>
<td><p>अ, ई, उ</p></td>
<td><p>अं, उँ, आः</p></td>
</tr>
<tr>
<td><p>Gujarati</p></td>
<td><p>અ, ઇ, ઈ</p></td>
<td><p>અં, અઃ</p></td>
</tr>
<tr>
<td><p>Konkani</p></td>
<td><p>अ, ई, उ</p></td>
<td><p>अं</p></td>
</tr>
<tr>
<td><p>Bodo</p></td>
<td><p>आ , ओ , ए , उ</p></td>
<td><p>ऐं , ऒं</p></td>
</tr>
<tr>
<td><p>Sindhi language of Devanagari script</p></td>
<td><p>अ , ऊ , ई , ऐ</p></td>
<td><p>ओं , एं , उं</p></td>
</tr>
</table>
<p><b>Rule 2 : {CH}C[v][m]</b></p>
<table class="tab-format">
<tr>
<td><p><strong> </strong></p></td>
<td><p><strong>Consonant is a syllable</strong></p></td>
<td><p><strong>Zero or more Consonant(Nukta *) + Virama sequences followed by consonant (+Nukta*)is a syllable</strong></p></td>
<td><p><strong>Zero or more consonant+ (Nukta*)+ virāma sequences followed by a consonant (+Nukta*) followed by a vowel sign is a syllable</strong></p></td>
<td><p><strong>zero or more consonant+ (Nukta*)+ virāma sequences followed by a consonant (+Nukta*) followed by modifier is a syllable</strong></p></td>
<td><p><strong>zero or more consonant+ (Nukta*)+ virāma sequences followed by a consonant (+Nukta*) followed by a vowel sign and modifier is a syllable</strong></p></td>
</tr>
<tr>
<td><p>Hindi </p></td>
<td><p>र, क, ज, ल, म</p></td>
<td><p>प्प, क्ख,च्त, ज्ज्व, त्क्ल, त्स्न , र्त्स्न्य, फ़्क</p></td>
<td><p>र्ता, र्त्स्न्या, फ़्जी, क्या, स्थि</p></td>
<td><p>तः,स्तं, स्त्रँ, स्तः, फ़्ज़ँ </p></td>
<td><p>र्त्स्न्या: त्स्न्युं, त्स्न्युँ, फ़्ज़ें,हि</p></td>
</tr>
<tr>
<td><p>Kannada</p></td>
<td><p>ರ, ಕ, ಜ, ಲ, ಮ</p></td>
<td><p>ಪ್ಪ, ಕ್ಖ,ಚ್ತ, ಜ್ಜ್ವ, ತ್ಕ್ಲ, ರ್ತ, ರ್ತ್ಸ, ರ್ತ್ಸ್ನ</p></td>
<td><p>ರ್ತಾ, ರ್ತ್ಸ್ನ್ಯಾ , ಖ್ವಾ</p></td>
<td><p>ತಃ, ಸ್ತಂ</p></td>
<td><p>ರ್ತ್ಸ್ನ್ಯಾಃ , ತ್ಸ್ನ್ಯುಂ</p></td>
</tr>
<tr>
<td><p>Tamil</p></td>
<td><p>க, ச, ங</p></td>
<td><p>க்ஷ</p></td>
<td><p>ஶ்ரீ , ஸ்ரீ , ரா</p></td>
<td><p>NA</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Telugu</p></td>
<td><p>ర, క, జ, ల</p></td>
<td><p>ప్ప, క్ఖ, చ్త,జ్జ్వ , ర్త్స్న , ర్త్స్న్య</p></td>
<td><p>ర్తా, ర్త్స్న్యా , ఖ్ఖి</p></td>
<td><p>తః, స్తం</p></td>
<td><p>క్కిం , ఖ్ఖిం , గ్గిం</p></td>
</tr>
<tr>
<td><p>Malayalam</p></td>
<td><p>ര, ക, ജ, ല, മ</p></td>
<td><p>പ്പ, ജ്ജ്വ, ത്സ, ക്ത</p></td>
<td><p>ക്ഷി, ത്തി, ത്സാ, ജ്ഞി , മ്മീ</p></td>
<td><p>നഃ, മഃ</p></td>
<td><p> ക്ലി , ത്തിം</p></td>
</tr>
<tr>
<td><p>Bengali</p></td>
<td><p>ক, ঙ, ঘ, ছ</p></td>
<td><p>ক্ক, ষ্ট, ষ্ণ, থ্র</p></td>
<td><p>ণ্যে, ন্ত্রে , গ্নে , গ্নী , ন্ত্রী </p></td>
<td><p>NA</p></td>
<td><p> স্যাঁ, ট্যাঁ, খ্রীঃ, ষ্টাং</p></td>
</tr>
<tr>
<td><p>Nepali</p></td>
<td><p>क छ ड भ </p></td>
<td><p>क्क क्ख ज्ज्व</p></td>
<td><p>र्पे , स्ति</p></td>
<td><p>तः स्त्रं</p></td>
<td><p>त्स्न्युँ</p></td>
</tr>
<tr>
<td><p>Manipuri language of Bengali script</p></td>
<td><p>ক, ল, ম, প </p></td>
<td><p>ন্দ, ক্ত , পৃ, র্জ্জ </p></td>
<td><p>র্তি, (পার্তি) , ঙ্থ্রৈ</p></td>
<td><p>ক্তং (খজিক্তং)</p></td>
<td><p>দাঃ, ন্দ্রাং, প্ত্রেং</p></td>
</tr>
<tr>
<td><p>Kashmiri language of Devanagari script</p></td>
<td><p>र, क, ज, ल</p></td>
<td><p>त्य, थ्व, च्य</p></td>
<td><p>न्यॊ, र्ता प्रा, क्या , प्रॉ</p></td>
<td><p>स्तं</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Maithili</p></td>
<td><p>र, क, ज</p></td>
<td><p>क्ख , न्ह, न्ध , फ़्क</p></td>
<td><p>र्ता, र्त्स्न्या, फ़्जी, क्या </p></td>
<td><p>तः,स्तं, स्त्रँ, स्तः</p></td>
<td><p>त्स्न्युं , त्स्न्युँ, फ़्ज़ें </p></td>
</tr>
<tr>
<td><p>Dogri</p></td>
<td><p>क, ज,स ,ल</p></td>
<td><p>ग्ग, द्ध , क्क</p></td>
<td><p>फ्ही , म्मी , ड़ि , क्का</p></td>
<td><p>जं , सं</p></td>
<td><p>यें , च्चैं , रें</p></td>
</tr>
<tr>
<td><p>Odia</p></td>
<td><p>କ, ଜ, ମ, ର, ଳ</p></td>
<td><p>କ୍କ, ଚ୍ଚ, ଟ୍ଟ, ଜ୍ଜ, ନ୍ନ , ଜ୍ଜ୍ୱ , ର୍ଣ୍ଣ, , ର୍ତ୍ସ </p></td>
<td><p>ର୍ତ୍ତା, ଜ୍ଞା, ଜ୍ଞୀ , ସ୍ଥି</p></td>
<td><p>ତଃ, ସ୍ତଂ, ସ୍ତଃ</p></td>
<td><p>ହିଂ</p></td>
</tr>
<tr>
<td><p>Punjabi</p></td>
<td><p>ਕ, ਜ, ਧ,ਵ</p></td>
<td><p>ਪ੍ਰ, ਕ੍ਰ , ਸ੍ਵ</p></td>
<td><p>ਨ੍ਹਾ,ਕੌ,ਹੋ</p></td>
<td><p>ਧੰ, ਯੱ, ਨੰ</p></td>
<td><p>ਮਾਂ, ਪੁੱ, ਚਿੱ </p></td>
</tr>
<tr>
<td><p>Sanskrit (Excluding Vedic Extensions)</p></td>
<td><p> ग,ड,प,र,ण</p></td>
<td><p>ल्म, त्य, ल्प</p></td>
<td><p>क्षे, र्था, यो , प्तो</p></td>
<td><p>तं ,न्तः, मः , प॑ , र॒</p></td>
<td><p>षाः, ताः, स्यां , दी॑ , हि॒</p></td>
</tr>
<tr>
<td><p>Marathi</p></td>
<td><p>ल, ष, ळ</p></td>
<td><p>स्व, क्ष्ण</p></td>
<td><p>व्या, त्स्ना</p></td>
<td><p>कं, स्पं </p></td>
<td><p>त्क्रां,त्र्यां </p></td>
</tr>
<tr>
<td><p>Assamese</p></td>
<td><p>ক , খ, ঘ</p></td>
<td><p>ন্ত্ৰ , ৰ্খ, ৰ্জ , ৰ্ট</p></td>
<td><p>ৰ্কে , ন্হা, ছ্ছা , ম্প্ৰ্দা</p></td>
<td><p>ৰ্নিং , ৰ্ণাং , ট্ৰাং , ৰ্কিং </p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Santhali language of Devanagari script</p></td>
<td><p>र, क, ज, ल, म</p></td>
<td><p>NA</p></td>
<td><p>र्ता, ड़ि</p></td>
<td><p>तः, कं , मः </p></td>
<td><p>ताः, रें</p></td>
</tr>
<tr>
<td><p>Gujarati</p></td>
<td><p>ર, ક , લ, મ</p></td>
<td><p>ક્ક, દ્ય, સ્ત્ર , ર્જ્જ, ર્પ્ક્ક</p></td>
<td><p>ર્તા,ર્ત્સ્ન્યા, ક્યા</p></td>
<td><p>તઃ, સ્તઃ</p></td>
<td><p>ર્ત્સ્ન્યાઃ, હિં</p></td>
</tr>
<tr>
<td><p>Konkani</p></td>
<td><p>ळ</p></td>
<td> <p>य, स्प, ल्म, स्थ , ल्ल्य</p></td>
<td><p>ज्यु, त्मे, स्त्री, स्तू, भ्रू</p></td>
<td><p>स्कं, स्थं, न्हं, द्वं </p></td>
<td><p>व्हां, म्हों, ल्लें, र्दें</p></td>
</tr>
<tr>
<td><p>Bodo</p></td>
<td><p>ब, फ, ख , ज</p></td>
<td><p>प्ता , ज्ज , ब्ला </p></td>
<td><p>ब्ला , यो , न्दो , न्थि</p></td>
<td><p>सं , रं , गं , न्थं</p></td>
<td><p>खां , दुं </p></td>
</tr>
<tr>
<td><p>Sindhi language of Devanagari script</p></td>
<td><p>क, घ , ज , ग</p></td>
<td><p>क्ट , ग्घ , फ्ख , स्त्र , च्ग़ , न्ज</p></td>
<td><p>बि , लू , यि , क्षी </p></td>
<td><p>धं , धृं , षं</p></td>
<td><p>हिं , सौं , श्रिं </p></td>
</tr>
</table>
<strong> * Nukta within a bracket is optional. A consonant may or may not include a Nukta.</strong>
<p><b>Rule 3 : CH</b></p>
<p>This rule is applicable only for those Indian languages where a pure consonant appears at the end of the word.</p>
<table class="tab-format">
<tr>
<td><p> </p></td>
<td><p>Examples of Rule3 - Consonant + virama at the end of the word</p></td>
</tr>
<tr>
<td><p>Hindi </p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Tamil</p></td>
<td><p>வணக்கம் , தமிழ் , எண்ணம், செயல்</p></td>
</tr>
<tr>
<td><p>Kannada</p></td>
<td><p>ಬ್ಯಾಂಕ್</p></td>
</tr>
<tr>
<td><p>Telugu</p></td>
<td><p>క్ , జ్ , ఞ్ </p></td>
</tr>
<tr>
<td><p>Malayalam</p></td>
<td><p>വാക്ക്, ചാക്ക് , നിനക്ക്</p></td>
</tr>
<tr>
<td><p>Bengali</p></td>
<td><p>ত্ (হঠাৎ) , This rule would not be applicable if ৎ is declared as pure consonant. </p></td>
</tr>
<tr>
<td><p>Nepali</p></td>
<td><p>छन्, हुन्, गर्दैनन्, गर्छस्</p></td>
</tr>
<tr>
<td><p>Manipuri language of Bengali script</p></td>
<td><p> খ্বাঙজেৎ</p></td>
</tr>
<tr>
<td><p>Kashmiri language of Devanagari script</p></td>
<td><p> NA</p></td>
</tr>
<tr>
<td><p>Maithili</p></td>
<td><p> NA</p></td>
</tr>
<tr>
<td><p>Dogri</p></td>
<td><p>राह् , ओह् </p></td>
</tr>
<tr>
<td><p>Odia</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Punjabi</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Sanskrit (Excluding Vedic Extensions)</p></td>
<td><p>तेजस्, मरुत् , माम्</p></td>
</tr>
<tr>
<td><p>Assamese</p></td>
<td><p>ত্ (হঠাৎ) , This rule would not be applicable if ৎ is declared as pure consonant. </p></td>
</tr>
<tr>
<td><p>Santhali language of Devanagari script</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Gujarati</p></td>
<td><p>આત્મસાત્</p></td>
</tr>
<tr>
<td><p>Konkani</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Bodo</p></td>
<td><p>NA</p></td>
</tr>
<tr>
<td><p>Sindhi language of Devanagari script</p></td>
<td><p>NA</p></td>
</tr>
</table>
</div>
</section>
</section>
<section id="h_text_segmentation">
<h2>Text segmentation</h2>
<p>A string of Unicode-encoded text often needs to be broken up into text elements programmatically. Common examples of text elements include what users think of as characters, words, lines (more precisely, where line breaks are allowed), and sentences. The precise determination of text elements may vary according to orthographic conventions for a given script or language. The goal of matching user perceptions cannot always be met exactly because the text alone does not always contain enough information to unambiguously decide boundaries. For example, the period (U+002E FULL STOP) is used ambiguously, sometimes for end-of-sentence purposes, sometimes for abbreviations, and sometimes for numbers. In most cases, however, programmatic text boundaries can match user perceptions quite closely, although sometimes the best that can be done is not to surprise the user. Word boundaries are used in a number of different contexts. The most familiar ones are selection (double-click mouse selection, or “move to next word” control-arrow keys), and “Whole Word Search” for search and replace. They are also used in database queries, to determine whether elements are within a certain number of words of one another.
Grapheme cluster boundaries are important for collation, regular expressions, UI interactions (such as mouse selection, arrow key movement, backspacing), segmentation for vertical text, identification of boundaries for Initial-letter styling, and counting “character” positions within text. [[!UAX29]]</p>
<section>
<h3>Word Boundaries</h3>
<p>Solution for word boundaries:<br />
User-perceived character boundaries should be based on tailored grapheme cluster boundaries that conform to the Indic orthographic syllable definition. <br />
</p>
<p>The Devanagari phrase separators are ।, U+0964 DEVANAGARI DANDA (called <i class="foreign">purna viram</i> in Hindi), and ॥, U+0965 DEVANAGARI DOUBLE DANDA (<i class="foreign">deergh viram</i> in Hindi, used to mark the end of a verse, as in Sanskrit text, shlokas, etc.). In some browsers the final word is selected together with the <i class="foreign">purna viram</i> on double-click, while in other browsers the <i class="foreign">purna viram</i> is selected separately. The properties of <i class="foreign">purna viram</i> and <i class="foreign">deergh viram</i> should therefore be the same as the properties of the full stop and other punctuation marks, so that a new line does not begin with <i class="foreign">purna viram</i> or <i class="foreign">deergh viram</i>.</p>
<p>For other characters, text segmentation should follow the Indic orthographic syllable definition.</p></section>
<section>
<h3>Typographic units </h3>
<p>Indic script behavior in initial letter styling is based on syllables, rather than individual letter forms.</p>
<img src="images/drop-letter1.jpg" alt="example of drop letter"/>
<p>The figure above shows an example of a drop initial in Hindi. In the first word of the paragraph, स्कूल ('skūl'), the sequence of characters stored in memory is as follows:</p>
<img src="images/initial-letter-ex.jpg" alt="initial letter example" />
<p>There are two syllables in this word: SA+VIRAMA+KA+UU and LA. Note, however, that there are three Unicode grapheme clusters here: SA+VIRAMA, KA+UU and LA.</p>
<p>Styling is done on the basis of the whole orthographic syllable, not the first character, nor even the first grapheme. </p>
</section>
</section>
<section id="h_line_breaking">
<h2>Line breaking</h2>
<p>When inline-level content is laid out into lines, it is broken across line boxes. Such a break is called a line break. In most writing systems, in the absence of hyphenation a line break occurs only at word boundaries. Many writing systems use spaces or punctuation to explicitly separate words, and line break opportunities can be identified by these characters. Line breaking, also known as word wrapping, is the process of breaking a section of text into lines such that it will fit in the available width of a page, window or other display area. </p>
<section id="h_hyphenation">
<h3>Hyphenation</h3>
<p> There are different cases of hyphenation, some of the cases are given below :</p>
<p><b>Case 1:</b> Hyphens are commonly used in copulative compound words in Hindi. Hindi has both prefixes and suffixes which are joined to words with a hyphen.</p>
<p>नर-नारी, लाभ-हानि, माता-पिता, ऊंच-नीच</p>
<p><b>Case 2:</b> A single word can break at the end of a line, following Indic orthographic syllable boundaries, using a hyphen. The following example shows the correct representation of the words आकर्षण and विज्ञापन using a hyphen:</p>
<br />
<img src="images/untitled.jpg" width="537" height="227" alt="Example of Line breaking" /> </section>
<section id="h_guiding_principles_line_breaking_indian_languages">
<h3>Guiding principles of Line breaking for Indian languages</h3>
<p>In Indic writing systems, it is preferred that lines break at word boundaries. If required, the following principles may be adhered to:</p>
<p><b>Rule 1:</b> A new line cannot begin with the following symbols/punctuation marks. These should also be retained with the associated text.</p>
<table class="tab-format">
<tr>
<td><p><strong>Symbols</strong> </p></td>
<td><p><strong>Character name</strong> </p></td>
<td><p><strong>Unicode code-point</strong> </p></td>
</tr>
<tr>
<td><p>। </p></td>
<td><p>DEVANAGARI DANDA </p></td>
<td><p>U + 0964 </p></td>
</tr>
<tr>
<td><p>॥ </p></td>
<td><p>DEVANAGARI DOUBLE DANDA </p></td>
<td><p>U + 0965 </p></td>
</tr>
<tr>
<td><p>) </p></td>
<td><p>RIGHT PARENTHESIS </p></td>
<td><p>U + 0029 </p></td>
</tr>
<tr>
<td><p>+ </p></td>
<td><p>PLUS SIGN </p></td>
<td><p>U + 002B </p></td>
</tr>
<tr>
<td><p>* </p></td>
<td><p>ASTERISK </p></td>
<td><p>U + 002A </p></td>
</tr>
<tr>
<td><p>- </p></td>
<td><p>HYPHENATIONPOINT-VISIBLE HYPHEN<br>
HYPHENATION-SOFT HYPHEN </p></td>
<td><p>U + 2027<br>
U+ 00AD </p></td>
</tr>
<tr>
<td><p>/ </p></td>
<td><p>SOLIDUS </p></td>
<td><p>U + 002F </p></td>
</tr>
<tr>
<td><p>, </p></td>
<td><p>COMMA </p></td>
<td><p>U + 002C </p></td>
</tr>
<tr>
<td><p>. </p></td>
<td><p>FULL STOP </p></td>
<td><p>U + 002E </p></td>
</tr>
<tr>
<td><p>: </p></td>
<td><p>COLON </p></td>
<td><p>U + 003A </p></td>
</tr>
<tr>
<td><p>; </p></td>
<td><p>SEMICOLON </p></td>
<td><p>U + 003B </p></td>
</tr>
<tr>
<td><p>= </p></td>
<td><p>EQUALS SIGN </p></td>
<td><p>U + 003D </p></td>
</tr>
<tr>
<td><p>&gt; </p></td>
<td><p>GREATER-THAN SIGN </p></td>
<td><p>U + 003E </p></td>
</tr>
<tr>
<td><p>] </p></td>
<td><p>RIGHT SQUARE BRACKET </p></td>
<td><p>U + 005D </p></td>
</tr>
<tr>
<td><p>_ </p></td>
<td><p>LOW LINE </p></td>
<td><p>U + 005F </p></td>
</tr>
<tr>
<td><p>| </p></td>
<td><p>VERTICAL LINE </p></td>
<td><p>U + 007C </p></td>
</tr>
<tr>
<td><p>} </p></td>
<td><p>RIGHT CURLY BRACKET </p></td>
<td><p>U + 007D </p></td>
</tr>
<tr>
<td><p>~ </p></td>
<td><p>TILDE </p></td>
<td><p>U + 007E </p></td>
</tr>
<tr>
<td><p>% </p></td>
<td><p>PERCENT SIGN </p></td>
<td><p>U + 0025 </p></td>
</tr>
</table>
<p><b>Rule 2:</b> The definition of Indic orthographic syllable may be used to break the line, and a hyphen should be placed at the breaking point so that the word can be read intuitively.
However, language-specific morpho-phonemic rules and industry practices (from media, publishing and grammar books) could be used for hyphenation. U+00AD (soft hyphen) is used in some languages such as Tamil and Malayalam.</p>
<p><b>Rule 3: </b>The hyphenated words can be broken at the hyphen e.g.:</p>
<ul>
<li>नर-नारी should be treated as:</li>
<li>नर- on the first line and नारी on the next line</li>
</ul>
<p><img src="images/hyphen1.jpg" alt="hyphenation example at line breaking" width="793" height="377" /></p>
<p><b>Rule 4:</b> An expression with mathematical symbols should be treated as a single unit, so that at the end of a line the expression does not break at the operator level.</p>
<p><b>Rule 5:</b> Breaking should not be allowed within numerical values such as currency values, years, etc., e.g.</p>
<p>“100.00” or “10,000”, nor in “12:59”</p>
</section>
</section>
<section id="h_initial_letter_styling">
<h3>Initial letter styling</h3>
<p>Drop initial is a typographic effect emphasizing the initial letter(s) of a block element with a presentation similar to a 'floated' element.</p>
<section id="h_selecting_initial_letters">
<h4>Selecting initial letters</h4>
<p>Initial letters in Indic scripts must be selected on the basis of orthographic syllables, rather than individual letter forms (see an example at the end of <a href="#h_text_segmentation"></a>). A detailed definition of Indic syllables can be found in <a href="#h_indic_orthographic_syllable_boundaries"></a>. In Indian languages the size of the initial letter is determined by the number of the lines between the top line of the syllable and the lowest bit in the orthographic Indic syllable cluster where subjoined consonant and other diacritics appear.</p></section>
<section id="h_typical_drop_initial_usage_in_indic_scripts">
<h4>Typical drop initial usage in Indic scripts</h4>
<p>Most of the Indic drop initial letters in magazines and newspapers use 2 to 4 line drops. Some examples are shown below.</p>
<figure> <img src="images/dropcap-example2.png" alt="examples of indic initial letters" />
<figcaption>Examples of Indic Initial letters</figcaption>
</figure>
<p>The <a href="https://www.w3.org/TR/dpub-latinreq/#raised-caps-and-sunken-caps">sunken and raised</a> initial letters are not preferred in Indian languages. In the above examples, reference points on the drop cap must align precisely with reference points in the text. In Indic scripts the top reference point is the hanging baseline for those scripts that have one, and the mean/median line for those that don't, and the bottom alignment point is the text after-edge.</p>
<p><a href="https://www.w3.org/TR/css-inline-3/#initial-letter-wrapping">Initial letter wrap property</a> is not applicable for Indian languages. No <a href="https://www.w3.org/TR/css-inline-3/#initial-letter-wrapping">contour-filling</a> is required in Indian languages.</p>
<p>Alignment of the top line of the non-highlighted characters at the top of the thicker top line of the initial letter is common in India. In some examples the top lines of the initial letter and the following letters don't touch. This is due to variable technology/formats used by the publishers. It is preferred that both the top lines of Initial letter and neighbouring text should touch.</p>
<p>Here are some additional examples of initial highlighted letter and drop letter based on the Indic syllable definition.</p>
<p class="exampleList"><img src="images/I-letter1.png" alt="Bengali example" /></p>
<p class="exampleList"><img src="images/I-letter2.png" alt="Punjabi example" /></p>
<p class="exampleList"><img src="images/I-letter3.png" alt="Tamil example"/></p>
<p class="exampleList"><img src="images/I-letter5.png" alt="Malayalam example"/></p>
<p class="exampleList"><img src="images/I-letter4.png" alt="Odia example"/></p>
<p class="exampleList"><img src="images/I-letter6.png" alt="Marathi example" width="555" height="255"/></p>