\documentclass[a4paper,11pt]{report}

% Page layout, adjusted relative to the report class defaults.
% Move the top of the page content up by one inch.
\addtolength{\topmargin}{-1in}

% Text block: 6.0in wide ...
\setlength{\textwidth}{6.0in}

% ... and 9.5in tall.
\setlength{\textheight}{9.5in}

% Move the text block 0.7in to the left.
% NOTE(review): only \oddsidemargin is changed; this assumes the class is
% used one-sided (no twoside option is given) -- confirm.
\addtolength{\oddsidemargin}{-0.7in}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

\usepackage[english]{babel}

\usepackage{graphicx}

\usepackage{color}

\usepackage{amssymb}

%\usepackage[dvips]{changebar}

%\renewcommand{\baselinestretch}{1.5}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% code: wrapper for code listings (used around verbatim blocks).
% The contents are indented with quote and set single-spaced in
% \footnotesize.  \baselinestretch is changed *before* \footnotesize so
% that the size switch picks up the new stretch.
% No explicit restore is needed at the end: \begin{code}...\end{code}
% forms a group, so the surrounding font size and \baselinestretch come
% back automatically when the environment closes.
% The trailing % signs keep line ends from leaking spaces into the text.
\newenvironment{code}
	       {%
		 \renewcommand{\baselinestretch}{1}%
		 \footnotesize
		 \begin{quote}%
	       }
	       {%
		 \end{quote}%
	       }
% tblenv: floating, centred, single-spaced \footnotesize container used
% for tabular material.  It is built on the figure float, so a \caption
% inside it is numbered as a figure.
% - \baselinestretch is set before \begin{figure} so that the font-size
%   switches performed inside the float use the single-spaced value.
% - \centering is used instead of the center environment, which would add
%   extra vertical space inside the float.
% - [htbp] lets LaTeX fall back to the bottom of a page or a float page
%   when the table does not fit "here".
% - No explicit restore at the end: the environment group restores the
%   outer font size and \baselinestretch when it closes.
\newenvironment{tblenv}
	       {%
		 \renewcommand{\baselinestretch}{1}%
		 \begin{figure}[htbp]%
		   \centering
		   \footnotesize
	       }
	       {%
		 \end{figure}%
	       }
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% \book{#1}{#2}{#3}{#4}: one bibliography-style line, "#1. #2. #3, #4."
% with #2 emphasised.
% NOTE(review): presumably author, title, publisher, year -- confirm
% against the call sites.
\newcommand{\book}[4]{#1. \emph{#2}. #3, #4.}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% Text-mode shorthands for symbols that are awkward to type directly.
% Each one switches to math mode itself, so use them in running text.

% Less-than and greater-than signs.
\newcommand{\lt}{$<$}

\newcommand{\gt}{$>$}

% A literal backslash.
\newcommand{\back}{$\backslash$}

% A tilde (accent over an empty group).
\newcommand{\home}{\~{}}

% Double-stroke arrow.  The original body used \doublearrow, which is
% defined neither by LaTeX nor by amssymb, so any use of \da stopped with
% "Undefined control sequence".
% NOTE(review): \Rightarrow is assumed to be the intended glyph; switch
% to \Leftrightarrow if a two-headed arrow was meant.
\newcommand{\da}{$\Rightarrow$}

% Single-stroke right arrow.
\newcommand{\dad}{$\rightarrow$}

% Three underlined spaces, each followed by a plain space.
\newcommand{\underlines}{\underline{\ }\ \underline{\ }\ \underline{\ }\ }

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% Text-mode shorthands for the subscripted symbols used in the
% killing/yanking rules.  Each one enters math mode itself.

% l with subscripts 0 and 1.
\newcommand*{\lz}{$l_0$}
\newcommand*{\lo}{$l_1$}

% u with subscripts 0, 1 and 2.
\newcommand*{\uz}{$u_0$}
\newcommand*{\uo}{$u_1$}
\newcommand*{\ut}{$u_2$}

% Sans-serif T with subscripts 0, 1 and 2.
\newcommand*{\Tz}{$\mathsf{T_0}$}
\newcommand*{\To}{$\mathsf{T_1}$}
\newcommand*{\Tt}{$\mathsf{T_2}$}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% The words "t" and "nil" set in the upright typewriter font.
\newcommand*{\lispt}{{\normalfont\ttfamily t}}
\newcommand*{\lispnil}{{\normalfont\ttfamily nil}}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% \lu{<name>}: typesets the name of a logical-unit type in the sans-serif
% family.  All \lu... macros below expand to a fixed name through \lu;
% multi-word names use ~ so that they are never broken across lines.
\newcommand{\lu}[1]{\textsf{#1}}

% Element and the parts of its header.
\newcommand*{\luele}{\lu{Element}}
\newcommand*{\luheader}{\lu{Header}}
\newcommand*{\luelename}{\lu{Element~name}}

% Attributes.
\newcommand*{\luattlist}{\lu{Attribute~list}}
\newcommand*{\luatt}{\lu{Attribute}}
\newcommand*{\luattname}{\lu{Attribute~name}}
\newcommand*{\luattvalue}{\lu{Attribute~value}}

% Character data.
\newcommand*{\luchardata}{\lu{CharData}}

% Processing instructions.
\newcommand*{\lupi}{\lu{Processing~Instruction}}
\newcommand*{\lupitarget}{\lu{Processing~Instruction~Target}}
\newcommand*{\lupibody}{\lu{Processing~Instruction~Body}}

% Remaining unit types.
\newcommand*{\lucomment}{\lu{Comment}}
\newcommand*{\luintdtd}{\lu{Internal~DTD}}
\newcommand*{\luentref}{\lu{Entity~reference}}

% The term "logical unit" itself, in plain text: singular/plural,
% lower-case and capitalised.  Write "\logu\ " when a space must follow.
\newcommand*{\logu}{logical unit}
\newcommand*{\logus}{logical units}
\newcommand*{\Logu}{Logical unit}
\newcommand*{\Logus}{Logical units}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% Recurring project terms, kept in one place so that the wording can be
% changed consistently.  All expand to plain text.
\newcommand*{\logl}{logical line}
\newcommand*{\ebuf}{Ebuffer}
\newcommand*{\etree}{Etree}
\newcommand*{\traveller}{climbing transparency}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% \key{<label>}: draws <label> inside a frame, to depict a keyboard key.
\newcommand{\key}[1]{\fbox{#1}}

% The four arrow keys: left, right, up, down.
\newcommand*{\kl}{\key{$\leftarrow$}}
\newcommand*{\kr}{\key{$\rightarrow$}}
\newcommand*{\ku}{\key{$\uparrow$}}
\newcommand*{\kd}{\key{$\downarrow$}}

% An unframed right arrow for use in running text.
\newcommand*{\ra}{$\rightarrow$}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% \C: switches the text colour to red until the enclosing group ends.
\newcommand{\C}{\color{red}}

% \reviewnote{<tag>}{<text>}: shared body of the annotation macros below.
% Prints "<tag>:" in bold \Large type followed by <text> at \normalsize,
% all in blue.
\newcommand{\reviewnote}[2]{{\color{blue} \Large {\normalfont\bfseries #1:} \normalsize
#2}}

% Annotations left in the draft; they differ only in the tag printed.
\newcommand{\todo}[1]{\reviewnote{TODO}{#1}}

\newcommand{\todoweb}[1]{\reviewnote{TODOWEB}{#1}}

\newcommand{\jbw}[1]{\reviewnote{JBW}{#1}}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -


% Line-spacing factor for the document: baseline distances are scaled by
% 1.5.  Set in the preamble, so it is in effect from the first page.
\renewcommand{\baselinestretch}{1.5}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

% Title-page data consumed by \maketitle.
% The subtitle is set in bold \large type on its own line.
% NOTE(review): the trailing \large presumably governs the line spacing
% of the subtitle line; it is kept as in the original -- confirm before
% removing it.
\title{Emaxml\\ \textbf{\large An Emacs mode for editing
XML}\large }

% "Dr.~" keeps the title attached to the name and prevents the period
% from being spaced like the end of a sentence.
\author{Paolo Debetto\\ Supervisor: Dr.~Joe Wells}

% The date slot is used for the course and deliverable identification.
\date{CS4 Dissertation\\ Deliverable 2}

\begin{document}

\maketitle
\newpage
\tableofcontents

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{quote}
  Emaxml is an extension of Emacs, written in Emacs Lisp, to edit XML
  documents. Major Emacs modes for editing SGML and XML already exist;
  this is different in that it allows viewing the document as a tree
  structure, both visually and logically.
\end{quote}
	
%================================================================
\section{\C Aims and objectives of the project}\label{sec:AimsAndObjectives}
%================================================================

{\bf [}An XML document is often generated automatically by an
application. Nevertheless, on many occasions XML code is edited
directly by a human author. When a normal text editor (i.e.\ one
with no XML-specific editing facilities) is used to this end, the
author's creativity has to deal with the XML document at three
levels:

\begin{enumerate}

	\item At the {\bf contents level}, the author is concerned
	with what the document is about, the actual information or
	concepts.

	\item At the {\bf structure level}, the author organises the
	document hierarchically, according to the rules set by the DTD
	for that particular class of documents.

	For non-trivial documents the overhead activity involved with
	keeping the structure in order or with changing the current
	structure can be very expensive.

	Moreover, the author has to be concerned with indentation or
	some other means to visually see the structure of the
	document.

	However, this activity is related to the conceptual contents
	of the document.

	\item At the {\bf syntactic level}, the author is concerned
	with getting the XML syntactic sugar right. This activity is
	strictly XML-related and has nothing to do with the topic of
	the document. It is an error-prone activity and the overhead
	involved can be very expensive.

\end{enumerate}

Obviously most of the work mentioned above can
be automated to various degrees by an editor with XML editing
facilities, with the aim of letting the author concentrate on
the contents and the structure of the document abstractly.

The approach of Emaxml is that of taking care of the XML syntax
and providing means of seeing and manipulating the structure of
the document effectively, by displaying the document in a
tree-like fashion.

Figure~\ref{fig:Emaxml} shows Emaxml at work.

{\C The final concrete objective is to implement a fully functional
Emacs mode, with a limited number of functionalities, but designed so
that it can easily be limitlessly improved by anyone who might
possibly want to work on it later.}

%================================================================
\section{\C Fundamental concepts of Emaxml}
%================================================================

This section is a brief reminder of the concepts central to the Emaxml
specification, detailed in Deliverable~1\footnote{The terms in
{\bf bold} are specific to Emaxml.}.

Figure~\ref{fig:Emaxml} shows an XML document displayed by Emaxml. The
parts of the display are called {\bf logical units}; their properties
influence the response of Emaxml to the user's input. A list of the
types of \logus\ is in Appendix~\ref{app:thelogus}.

Not all locations in the display belong to a \logu. For instance, the
colored spaces at the left of an \luelename\ or the colon following an
\luattname\ are not part of any logical unit. All such locations and
their contents are said to be {\bf automatic}, because they are
managed by Emaxml and cannot be reached by the user. Non-automatic
locations of the buffer form the {\bf user space} of the buffer. The
user may place the cursor on some automatic locations, namely where
such a location is immediately on the right of a user location. These
locations are therefore not completely automatic, are called {\bf
ubiquitous}, and allow the insertion of new text. For example, the
colon in an \luatt\ is ubiquitous and when the cursor is over it any
text inserted is appended to the relative \luattname.


A {\bf logical line} is either one line (up to a newline) of a
multiline \logu\ or an entire monoline \logu.


The {\bf previous} \logl\ with respect to a \logl\ is the first \logl\
that is encountered by going left and up {in the display}.  The {\bf
following} \logl\ with respect to a \logl\ is the \logl\ that is
encountered by going right and down.

The topmost element is called the {\bf seed
element}\footnote{Because it comes before the root element...},
and does not correspond to an actual XML element; it contains
information relative to the document, such as that contained in
the XML declaration. From the user's point of view, the header of
the seed element is treated as a normal header, whose attributes
are the said information.

% Screenshot of Emaxml with the logical units marked.
% The graphic (16cm) is wider than the text block (6in), so it is put in
% a \makebox of width \textwidth: it keeps its size, overhangs both
% margins equally instead of spilling to the right only, and no longer
% produces an overfull box.
\begin{figure}[htbp]
\centering
\makebox[\textwidth][c]{%
  \includegraphics[width=16cm]{../deliv1/fig-Emaxml_lu-16x20.eps}}
\caption{Screenshot of Emaxml, with the indication of the Logical
Units.}
\label{fig:Emaxml}
\end{figure}

An element (and the relative subtree rooted at it) can be
displayed {\bf outline} or {\bf inline} (that is, vertically or
horizontally) and {\bf expanded} or {\bf collapsed} (that is,
completely visible or displayed as the element name only).

These characteristics are independent so there are four ways of
displaying a subtree (see Fig.~\ref{fig:ilol}), called {\bf display
modes}: outline-expanded, outline-collapsed, inline-expanded,
inline-collapsed.

% The four display modes of a subtree.
% The graphic (18cm) is wider than the text block (6in), so it is put in
% a \makebox of width \textwidth: it keeps its size, overhangs both
% margins equally instead of spilling to the right only, and no longer
% produces an overfull box.
\begin{figure}[htbp]
\centering
\makebox[\textwidth][c]{%
  \includegraphics[width=18cm]{../deliv1/fig-tree_views-24x12.eps}}
\caption{Display modes}
\label{fig:ilol}
\end{figure}

The {\bf display state} of a subtree is defined by the display
mode of all the elements it is formed by.

%================================================================
\section{Changes to the original project plan}
%================================================================

The most important change to the specification of Emaxml as
described in Deliverable~1 regards the set of \logus, which has
been modified as follows:

\begin{itemize}

\item \lu{CDATA} \logus\ have been eliminated.

  A CDATA section in an XML document serves the purpose of letting the
  author write any sequence of characters without having to ``escape''
  them as entity references.

  In the previous specification, Emaxml had a \logu\ for CDATA
  sections. The change is that now the author writes whatever s/he
  wants as character data, and the Writer will codify the character
  data optimally using combinations of CDATA sections, entity
  references, character references and plain character
  data.

\item \luentref\ \logus\ have been introduced.

  Entity references are a sort of macro facility used in XML. They are
  substituted with the text they refer to when the specific
  application processes the information contained in the XML file.

  An \luentref\ \logu\ is an elementary, special, monoline, primary
  \logu, according to the definitions of these properties given in
  Appendix~\ref{app:thelogus}.

\end{itemize}

These changes have affected the design of the Parser, the Writer and
the Emaxml mode; in particular, the hierarchy of types (the XD data
model) that governs the main data structure (the \etree) has been
changed accordingly.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Functional specification of the system}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The expected functionality of the system has been set in
Deliverable~1; what follows is a revision of it, according to the
feedback from the readers and discussion with the project supervisor.

The functionalities are divided into two main categories: standard Emacs
operations revisited for Emaxml, and Emaxml-specific operations.

%================================================================
\section{Standard Emacs editing operations {\em \`{a} la} Emaxml}\label{sec:edop}
%================================================================

The typical Emacs user will expect a number of standard editing
facilities to be available in Emaxml mode, and their behavior to
parallel that of other modes.

It is not as important to implement a large number of functionalities
immediately as it is to design the code in a modular fashion and
document it properly, and choose at least a few operations from each
of the following categories:

\begin{itemize}

\item {\bf PM}: point movement;

\item {\bf ID}: insertion and deletion;

\item {\bf MK}: mark operations;

\item {\bf RE}: region setting;

\item {\bf SR}: search \& replacement;

\item {\bf GE}: general common operations (e.g.\ `undo').

\end{itemize}

Appendix~\ref{app:stdcmd} lists a set of commands that may be
implemented for Emaxml as part of this project.

%------------------------------------------------------------
\subsection{Visiting a file}
%------------------------------------------------------------

Emaxml mode will be activated on visiting a file with extension
{\em .xml} and when a buffer containing a file with extension
other than {\em .xml} is saved with extension {\em
.xml}\footnote{In Emacs, if a buffer is saved with `C-x C-w'
(`Save Buffer As...' in the Files menu) with a different
extension, its major mode changes accordingly.}.

In the latter case, the buffer also needs to be redisplayed.

In case the file does not already exist, the buffer will contain the
seed element, blank, as described in section~\ref{sec:create}.

%------------------------------------------------------------
\subsection{Saving a file}\label{sec:savingAFile}
%------------------------------------------------------------

When a file needs saving, the Writer is invoked to produce the XML
code from the internal representation of the document.

Also, the \luintdtd\ is syntactically checked before the file is
saved, because it may contain sequences of characters that would
make the written file illegal\footnote{Consider for example an
\luintdtd\ containing {\tt ...]]> <root-element>...}. If this is
written without being checked, the file will be subsequently
unparsable.}. In the case that the \luintdtd\ is recognised as
incorrect, the file will be saved without it, and the user will be
advised so that s/he can take whichever action s/he thinks
appropriate.

%------------------------------------------------------------
\subsection{Highlighting the region} \label{sec:high}
%------------------------------------------------------------

When using the mouse to highlight the region, or in Transient Mark
mode\footnote{In Transient Mark mode, when the mark is active, the
region is highlighted.}, only the locations of user space included in
the region will be highlighted.

However, when a primary \logu\ is completely included in the highlighted
region, all of its physical space will be colored, including the
non-user space, otherwise only the included user space will be
colored.

Two useful commands are implemented in Emaxml: mark-more and
mark-less.

\begin{description}

\item[Mark more] is to expand the region to the next bigger \logu\
  containing the region (or containing point if mark is undefined).

\item[Mark less] does the opposite. Calls to mark-less always select
the first child when there is more than one.

\end{description}

For example, if the region is currently a string in an \luattvalue,
mark-more sets region to the containing \luatt. Further calls to
mark-more set region to the containing \luattlist, then to the
\luheader, then to the \luele, and so on up to the entire
buffer. 

%------------------------------------------------------------
\subsection{Text search and replacement}
%------------------------------------------------------------

Text search and replacement in Emaxml will operate on the user
space only.  Both string and regular expression search and
replacement will be implemented.

%------------------------------------------------------------
\subsection{Moving point} \label{sec:move}
%------------------------------------------------------------

This section has not changed from Deliverable~1.


%% Movement in Emaxml acquires different meanings (and names)
%% depending on which level it is performed at:

%% \begin{itemize}

%% \item At character level: {\bf sliding}.

%%   \begin{description}

%%   \item[Horizontal sliding]: moving by one character.

%%     Point can slide inside the current \logl\ one character backward
%%     or forward.

%%     Sliding point backward when at the beginning of a \logl\ moves it
%%     to the end of the previous \logl.

%%     Sliding point forward when at the end of a \logl\ moves it to the
%%     beginning of the previous \logl.

%%   \item[Vertical sliding]: moving by one \logl.

%%     When performing a vertical sliding movement (e.g. 'next-line'),
%%     point is expected to behave as closely as possible to the usual
%%     behavior.

%%     A movement that starts at the $l$th location of a \logl, and ends
%%     at a different \logl, will end at the $l$th location of the
%%     arrival \logl\ if that has at least $l$ characters, or at its last
%%     location otherwise.  However, $l$ is remembered for successive
%%     vertical movements.  This Emacs standard property I will call {\bf
%%     \traveller}, because it is proved for an editor by demonstrating
%%     that performing a vertical movement in one direction followed
%%     immediately by a vertical movement in the opposite direction,
%%     point will always return to the initial location.

%%     Point can slide one \logl\ up or down with \traveller.

%%   \end{description}

%% \item At the logical level: {\bf traversing}.

%%   Traversing (the XML tree) refers to moving from one XML
%%   component to another, hence the \logus\ involved are the primary \logus.

%%   Traversing will be implemented with no \traveller; point is
%%   always moved to the first character of the arrival \logu.

%%   \begin{description}

%%   \item[Horizontal traversing]: moving hierarchically\footnote{The
%%     choice of adjectives {\em horizontal} and {\em vertical} for
%%     traversing is due to the fact that, according to the general idea
%%     on which the layout style of Emaxml is based, the parent of a
%%     \logu\ can be thought of as being ``on the left'', while a
%%     peer is up or down.}.

%%     Point can traverse left from one of the said \logus\ to its XML
%%     parent.

%%     Traversing right can only be done from a \luheader\ to the first
%%     child of the current element, if any.

%%   \item[Vertical traversing]: moving to peers.

%%     Point can traverse from one of the said \logus\ (except
%%     the seed element, which has no peers) to the next or
%%     previous instance of the same \logu\ found in the buffer,
%%     regardless of them being sibling.

%%   \end{description}

%% \end{itemize}


%------------------------------------------------------------
\subsection{Mark and the mark ring}
%------------------------------------------------------------

The mark ring will be implemented in Emaxml, consistently with its
standard definition and functionality. Obviously, the internals
relative to such implementation will be specific to Emaxml;
markers have to be objects of a different data structure, since
they must describe a location in terms of the tree.

An important property of a marker that must be preserved is that
it ``moves'' with the position in the buffer to which it is
pointing, i.e. if some text is added or deleted before that
position, the marker always points to the same character.

%------------------------------------------------------------
\subsection{Killing and yanking}
%------------------------------------------------------------

{\em Killing} and {\em yanking} in Emacs jargon mean {\em cutting}
and {\em pasting} respectively.

In standard Emacs operation when a portion of the buffer is killed it
is deleted from the buffer and stored in the kill ring for later use.
One property (I shall call it {\bf yanking transparency}) of an Emacs
buffer is that if some text is killed and immediately yanked, the
buffer does not change. Point may be in a different location, though.

Yanking is an insertion operation; the portion of buffer being
yanked is inserted before point.

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{Killing in Emaxml} \label{sec:kill}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Many Emacs commands exist for killing. The ones considered here are:
kill-line, kill-region, kill-word, kill-sexp, kill-buffer. Their names
are self-explanatory (apart from kill-sexp, perhaps, which in Emaxml
kills the current element). Kill-line kills a \logl\
instead of a normal line.

Since the object of the editing in Emaxml is a tree, the killing
and yanking operations must be redefined in terms of \logus.

Let us consider a portion $p$ of an Emaxml buffer being killed
that starts from location \lz\ in primary \logu\ \uz\ and ends at
location \lo\ in primary \logu\ \uo.

When $p$ is killed, it is stored in the kill ring.

If $p$ is a string, it is stored as such, with no information
regarding which \logu\ it was part of. On the other hand, Emaxml stores
killed \logus\ and il\logus\ as {\em trees}.

This stored tree is:

\begin{itemize}

\item If $p$ is an entire \logu, the tree rooted at \uz, including all
  its children.

\item If $p$ is an il\logu, the tree rooted at the smallest \logu\ $s$
  that includes all the \logus\ totally or partially in $p$, and that
  does not include the children of $s$ which are not in $p$ and the
  portions of user space of $s$ not in $p$.

  This implies that the stored tree may have some blank \logus\ in it.

\end{itemize}

Killing $p$ has the following effects:

\begin{itemize}

\item[(1)] If $p$ is a string, point is left at \lz.

\item[(2)] If $p$ is a primary \logu, then point is left:

  \begin{itemize}

  \item if $p$ has a peer $q$ below it, at the first character of
    $q$'s user space;

  \item otherwise if $p$ has a peer $q$ above it, at the first
    character of $q$'s user space;

  \item otherwise at the first character of $p$'s parent's user space.

  \end{itemize}

\item[(3)] If $p$ is a secondary \logu, then it must be one of the
  elementary \logus\ in an \luele\ or in a \lupi; a blank \logu\ of
  the same type as $p$ is inserted in place of $p$ with point at its
  first location.

\item[(4)] If $p$ is an il\logu, then point is left as in policies
  (1), (2), (3) but substituting ``the portion of \uz\ in $p$'' for
  $p$.

  When killing an il\logu, Emaxml rebuilds the tree as follows, in
  order:

  \begin{itemize}

  \item[(i)] all primary \logus\ entirely included in $p$ are pruned;

  \item[(ii)] all secondary \logus\ entirely included in $p$ are
    blanked and possibly removed;

  \item[(iii)] all characters in $p$ are eliminated.

  \end{itemize}

\end{itemize}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{\C Yanking in Emaxml} \label{sec:yank}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

The following are the main general properties of yanking in Emaxml:

\begin{itemize}

\item The object $p$ to be yanked is either a string or a subtree in
  the kill ring.

\item Yanking leaves point after the last location of the user space
  of the yanked unit, i.e. the cursor is left under the first
  character of the user space after the yanked unit.

\item If $p$ is a string, it is yanked at point. This involves
applying low-level control\footnote{See
section~\ref{sec:lowlevelcontrol} for definition of low-level
control.}.

\end{itemize}

Yanking a non-string unit involves some tree manipulation. The only
limitation posed by Emaxml is that secondary \logus\ (such as
\luattname\ or \lupitarget) cannot be yanked other than in an
appropriate compound \logu\footnote{\C This could be improved in future by giving
a meaning to such operations. For example, yanking a secondary
\logu\ $s$ of type $t$ into an inappropriate primary \logu\ $u$ may be defined as
inserting a new appropriate primary \logu\ as a sibling of $u$, blank
but with the field $t$ equal to $s$.}.

Since both logical and illogical units are complete subtrees, there is
no difference in yanking them.

Let us consider a unit $p$ stored by Emaxml in the kill ring.

If $p$ is a primary \logu, then yanking transparency does not
hold\footnote{See Deliverable~1 for a discussion of this.}, so
$p$ must be explicitly yanked as a peer or as a child. If yanked as a
peer it is inserted before the current primary \logu, if yanked as a child it
is inserted as its last child.

The Emaxml policies for yanking $p$ into some \logu\ \ut\ are
described in Fig.~\ref{fig:yankingcriteria}.

\begin{tblenv}
  \begin{tabular}{l|l|p{5cm}}
    
    \hline
    
	{\bf $p$ is a} & {\bf \ut\ is a} & {\bf Yanking $p$ into \ut} \\

	\hline

	string & elementary \logu & $p$ can be yanked anywhere in \ut
	\\

	primary \logu & \luheader & $p$ can be yanked either as child
	or as peer of the element which \ut\ belongs to\\

	primary \logu & primary \logu & $p$ yanked as a peer of \ut \\

	\luattlist\ or \luatt & \luheader & $p$ inserted at end of
	\ut's \luattlist \\

	\hline

\end{tabular}
\caption{Yanking criteria for $p$ not illogical.}\label{fig:yankingcriteria}
\end{tblenv}

%================================================================
\section{Emaxml specialized editing operations}
%================================================================

Appendix~\ref{app:newcmd} lists the minimum set of new commands
specific to Emaxml to be implemented as part of this project.  They
are described in the following sections.

%------------------------------------------------------------
\subsection{Creating a new instance of a logical unit}\label{sec:create}
%------------------------------------------------------------

Creating a new instance inserts a {\bf blank \logu}. A blank \logu\ is
an instance of a \logu\ with the strings referring to the user space
empty. For example, an \luatt\ is of the form:

% Layout of an Attribute: name, then ":" and a space, then value.
% The labels U and A on the braces are explained in the text that follows.
% \[ ... \] replaces the plain-TeX $$ ... $$, and every script is braced.
\[
\overbrace{\mbox{\tt \underline{name}}}^{\mathcal{U}}
\underbrace{\mbox{\tt :}\Box}_{\mathcal{A}}
\overbrace{\mbox{\tt value}}^{\mathcal{U}}
\]

where `$\Box$' indicates a space, `$\mathcal{U}$' indicates user space
and `$\mathcal{A}$' indicates automatic/ubiquitous space. A blank
\luatt\ is therefore displayed as a colon followed by a space (which
cannot be seen) and represented internally\footnote{See
section~\ref{sec:DataStructures} for details on internal
representation.} as

\begin{code}
\begin{verbatim}
(attribute (attName "") (attValue ""))
\end{verbatim}
\end{code}

In the example, the user is able to move with the cursor over the
colon and insert characters which are taken as part of the attribute
name, or to place the cursor after the space to insert characters of
the attribute value.

Specifically, the blank \logus\ are described in terms of both displaying
and internal representation in Fig.~\ref{tbl:blanklogus}.

\begin{tblenv}
\begin{tabular}{|p{3cm}|p{4cm}|p{8cm}|}

  \hline

      {\bf \logu} & {\bf Display} & {\bf Internal representation} \\

      \hline
      \hline

      \luattname & & {\tt (attName "")} \\

      \hline

      \luattvalue & & {\tt (attValue "")} \\

      \hline

      \luatt & {\tt :$\Box$} & {\tt (attribute (attName "") (attValue
      ""))} \\

      \hline

      \luattlist & {\tt :$\Box$} & {\tt (attList (attribute (attName "") (attValue
      "")))} \\

      \hline

      \luelename & $\Box$ & {\tt (eleName "")} \\

      \hline
      
      \luheader & \underline{$\Box$}\underlines & {\tt (header
      (eleName ""))} \\

      \hline

      \luele & \underline{$\Box$}\underlines & {\tt (element (header (eleName
      "")))} \\

      \hline

      \luchardata & Three blank lines (one as top ``margin'', one for
      the text and one as bottom ``margin'') & {\tt (charData "")} \\

      \hline
      
      \lupitarget & & {\tt (PITarget "")} \\

      \hline

      \lupibody & & {\tt (PIBody "")} \\

      \hline

      \lupi & \underline{\tt $<$?$\Box$:$\Box$}\underlines & {\tt (PI
      (PITarget "") (PIBody ""))} \\

      \hline
   
      \lucomment & \underline{\tt $<$!--}$\Box$ & {\tt (comment "")}
      \\

      \hline

      \luintdtd & \underline{\tt $\Box$[}$\Box$ & {\tt (intDTD "")}
      \\

      \hline

      \luentref & \underline{\tt $\Box$\&$\Box\Box$;$\Box$}
      & {\tt (entRef "")} \\

      \hline


\end{tabular}
\caption{Blank \logus. `$\Box$' indicates a space, underlined text
  indicates a colored background.}
\label{tbl:blanklogus}
\end{tblenv}

When a new \logu\ is created, point is placed at its first ubiquitous
location, since there are no available user locations in a blank
\logu.

Inserting a new \logu\ does not always make sense, for example
creating a new \luattname\ alone, or creating a \lupibody\ when the
cursor is in the middle of an \luentref. The meaning of such commands
must be interpreted by guessing what the user may want when s/he
issues them. That depends on where point is and what type of \logu\
the user wants to create. The following is the set of rules that
governs the insertion of new \logus.

\begin{itemize}

%A new child is inserted as the first child of the element, if point is
%on the \luheader\ of the element, while a
%sibling is inserted in place of the primary component.


%\item Creating a primary \logu\ results in a new instance of such a
%  \logu\ being inserted as a sibling of the current primary, immediately
%  before that, unless the current primary is the \luseed, in which case
%  the new \logu\ is inserted after the seed (if such a \logu\ is
%  allowed, see Fig.~\ref{tbl:XD:types}).

\item A primary \logu\ can be created as a child of an element or as a
sibling of the current primary \logu. Different commands are provided
for these two operations.

\item Creating a \luattlist, a \luattname\ or a \luattvalue\ is
equivalent to creating an \luatt.

\item Creating a \lupitarget\ or a \lupibody\ is equivalent to
  creating a \lupi.

\end{itemize}

%------------------------------------------------------------
\subsection{Adjusting the displaying of the tree structure}
%------------------------------------------------------------

This section has not changed from Deliverable~1.

%% The following statements define the manipulation of the visual
%% tree:

%% \begin{itemize}

%% 	\item The tree structure is by default displayed entirely
%% 	outline-expanded when the document is initially visited.

%% 	\item Making an outline subtree inline makes all its children
%% 	temporarily inline, and does not change its or its children's
%% 	expansion mode.

%% 	\item Expanding a collapsed subtree brings it back to the
%% 	display status it was before being collapsed (i.e. all its
%% 	elements return to their previous display mode).

%% 	\item The root element and the seed cannot be made
%% 	inline.

%% 	\item The \luheader\ of a collapsed or inline element has no
%% 	user space.

%% \end{itemize}

%% An optional further development may be that the entire document
%% display status be saved along with the file (e.g. encoded somehow
%% inside the document) and restored the next time the document is
%% visited in Emaxml mode.

%================================================================
\section{Control issues} \label{sec:ctrl}
%================================================================

Emaxml, at the stage of development I set as target for my
project, will not enforce control over the tree structure created
by the user or read from an XML file in terms of the DTD.

Emaxml will see the document ``simply'' as a syntactic structure
that must comply with the grammar set out by the BNF rules in
\cite{w3c}. Note that the internal DTD is {\em not} parsed. The
user will be able to manipulate the tree as s/he wants without
being bothered with indentation. The ``less-thans'', quotes, and
other syntactic sugar of XML will be hidden. This should hopefully
let the user concentrate more on the contents, but, on the other
hand, the user will also be able to create documents that are not
well-formed, or invalid.

However, the actual code implementing the editing mode must be
designed so that adding semantic awareness and control should be
easy.

Examples of events that may trigger a contents check or other
semantics-related operations are:

\begin{itemize}

	\item the contents of a secondary elementary \logu\
	(i.e. \luelename, \luattname, \luattvalue) are changed at the
	character level;

	\item point leaves an elementary \logu;

	\item the tree is changed; this includes creation of instances
	of any \logu, killing and yanking at the logical and illogical
	levels;

	\item the contents of the seed element are changed;

	\item the contents of the \luintdtd\ are changed.

\end{itemize}

These and other such events must be easily recognizable and
exploitable from the programmer's point of view.

%------------------------------------------------------------
\subsection{Low-level control}\label{sec:lowlevelcontrol}
%------------------------------------------------------------

Some \logus, by their nature, have limitations on what can be inserted
in them. For instance, no spaces may be inserted in a \lupitarget,
since the target of a processing instruction can only be one word.

Emaxml enforces low-level control on the text inserted by the user, by
matching the contents of the current elementary \logu\ with a suitable
regular expression.

By {\em low-level control} I mean that only the well-formedness is
checked, as opposed to checking the validity as well, which would
involve ensuring that what the user inserts is also consistent with
the DTD declarations (for instance, that a referenced entity has been
declared in the DTD).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Background: Emacs buffer-related technologies}\label{sec:backgroundBuffer}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

A buffer in Emacs is a data structure that contains the information
needed to display something in a window, according to the mode the
buffer is set to. For example, a buffer in Text mode contains simply a
sequence of ASCII\footnote{Or Unicode, or ...} characters, while a
buffer in Enriched mode contains also information about
characteristics of portions of text, such as the color, the
weight, the justification, etc. A buffer in Enriched mode, when saved,
may look like this:

\begin{code}
\begin{verbatim}
Content-Type: text/enriched
Text-Width: 70

This text is normal.

<x-color><param>red</param>This text is red.</x-color>
<bold>This text is bold.</bold>
\end{verbatim}
\end{code}

This will produce a line of normal text, one of red text and one of
bold text. Information is in this case stored as markup {\em in the
buffer}, and it is saved in the file. 

Enriched mode is the simplest way of controlling the layout of text in
a window. In the following sections more powerful approaches are
examined. These sections are mainly a summary of the Emacs Lisp
manual (\cite{elisp}).

%================================================================
\section{Text properties}
%================================================================

Each character position in a buffer or a string can be associated with a
{\em text property list}. Each property has a name and a value, and there
exist Lisp functions to access and modify them.

Copying text between strings and buffers preserves the properties
along with the characters; this includes such diverse functions as
`substring', `insert', and `buffer-substring'.

There are many predefined text properties, not only related to layout,
but also to the behavior of Emacs related to a character
position. Fig.~\ref{tbl:textproperties} is a list of some text
properties that may be useful in designing Emaxml.

New text properties can be created and managed with the relevant
built-in functions. For example, in Emaxml it may be useful to attach to
a portion of the buffer a property called `subtree' in which to store
the corresponding portion of the \etree, in order to associate the
logical and the physical view of the document.

By default, inserted characters take on the same properties as the
preceding character.  This is called {\em inheritance} of
properties. Which properties are inherited, and from where, depends on
which properties are {\em sticky}.  Insertion after a character
inherits those of its properties that are {\em rear-sticky}.
Insertion before a character inherits those of its properties that are
{\em front-sticky}.  When both sides offer different sticky values for
the same property, the previous character's value takes precedence.

It is possible to save text properties along with the text in a
buffer, by exploiting the hook `write-region-annotate-functions'.

%================================================================
\section{Overlays}
%================================================================

Overlays are used to alter the appearance of a buffer's text on the
screen, for the sake of presentation features.  An overlay is an
object that belongs to a particular buffer, and has a specified
beginning and end.  It also has properties that can be examined and
set; these affect the display of the text within the overlay.

Overlay properties are like text properties in that the properties
that alter how a character is displayed can come from either source.
But in most respects they are different.  Text properties are
considered a part of the text; overlays are specifically considered
not to be part of the text.  Thus, copying text between various
buffers and strings preserves text properties, but does not try to
preserve overlays. Changing a buffer's text properties marks the
buffer as modified, while moving an overlay or changing its properties
does not.  Unlike text property changes, overlay changes are not
recorded in the buffer's undo list.

Many overlay properties are common with text properties.
Figure~\ref{tbl:overlayproperties} is a list of some peculiar overlay
properties that may be useful in Emaxml design.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{\C Implementation design}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The project is to be implemented as an extension to Emacs, i.e. as an
Emacs mode. Thus, it is to be coded in Emacs Lisp.

The following priorities have to be kept in mind during coding:

\begin{itemize}

\item \underline{Consistency} with the XML specification given in
  \cite{w3c}, in particular with the BNF definitions numbered in
  square brackets ({\bf BNF-defs} for short).

\item \underline{Modularity}, so that functions and constants can be
  re-used in different contexts and extended easily.

\item \underline{Readability} of the code, to facilitate future
  possible improvement.

\end{itemize}

%================================================================
\section{Data structures}\label{sec:DataStructures}
%================================================================

The object of the editing, an XML file, will be seen by Emaxml in
two ways at the same time: as an Emacs buffer (the {\bf \ebuf}),
used to display the visual representation of the tree, and as a
list (the {\bf \etree}), structured so as to allow an abstract,
logical view of the document and appropriate manipulation.

An interface to manipulate the \etree\ and its components is
provided as the {\em XD data model} described below.

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsection{The XD data model}\label{sec:theXDDataModel}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

The {\bf XML Document data model} ({\bf XD}) is composed of:

\begin{itemize}

\item a hierarchy of types ({\bf XD-types}) that reflect Emaxml
  \logus;

\item a set of functions ({\bf XD-functions}) to manipulate the
  objects in the data model ({\bf XD-objects});

\item a set of constants ({\bf XDRE Toolkit}) that reflect the
  BNF-defs.

\end{itemize}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{The XD-types}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

An XD-object belongs to one of the XD-types listed in
Fig.~\ref{tbl:XD:types}, and represents an instance of a \logu\
in the display.

In concrete terms an XD-object $p$ is a list $(C\; s_1\; [s_2
\cdots])$ whose first element $C$ is a symbol denoting the
XD-type of $p$ and whose other element(s) $s_i$ may be branches
of the tree generating from $p$ (that is, $p$'s {\bf children},
which are XD-objects themselves) or a string which refers to the
part of the user space connected with $p$.

An example of an XD-object of type `attribute' may be:

\begin{code}
\begin{verbatim}
(attribute (attName "length")
           (attValue "25.52cm"))
\end{verbatim}
\end{code}

The XD-types are listed in Fig.~\ref{tbl:XD:types} together
with the children they may have.

A `seed' object is a special kind of `element' object: it may have
only four attributes in its header (namely ``version'',
``encoding'', ``standalone'' and ``external DTD''), does not have
an element name and its children are limited as defined in
Fig.~\ref{tbl:XD:types}.

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{The XD-functions}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

The XD data model provides functions for the manipulation of its
objects. Their names start with {\tt XD-} and follow these naming
conventions:

\begin{itemize}

	\item {\tt XD-\lt...\gt} refers to a function that performs an
	operation on a child, for example {\tt (XD-\lt get\gt\ etree
	'header)} returns the entire object of type {\em header} of
	the object {\tt etree};

	\item {\tt XD-\gt...\lt} refers to a function that performs an
	operation on the contents of a child, for example {\tt (XD-\gt
	get\lt\ etree 'seed 'header 'eleName)} returns the string
	associated with the element name in the header of the seed of
	the object {\tt etree};

	\item {\tt XD-\{...\}} refers to a function that returns a
	list of objects, for example {\tt (XD-\{getall\} elt 'PI
	'comment)} returns a list of all the comments and processing
	instructions contained in the element {\tt elt};

	\item {\tt XD-...-p} indicates a predicate function, as for
	standard Lisp convention, i.e. a function that checks some
	condition and returns \lispnil\ or \lispt.

\end{itemize}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{The XDRE toolkit}\label{sec:theXDREToolkit}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

The {\bf XDRE} toolkit is a set of string constants which are
regular expressions that match some of the basic building blocks
of XML, defined by the BNF-defs\footnote{Not all the BNF-defs can
be translated into regular expressions, mostly because there is no
trivial way of translating a BNF difference construct, such as in
`(Char - ']')*', which indicates a sequence of zero or more
instances of the BNF production `Char' which are not `]'.}. Their
purpose is to be used in the parsing functions instead of literal
regexps, for readability.

Each constant's name is of the form {\tt XDRE-component}, where
{\tt component} reflects the name of a rule in the BNF-defs.

In constructing the regexp, the symbols {\tt `\lt\lt', `\gt\gt',
`||', `**', `++', `--'} are used in place of {\tt `\back \back (',
`\back \back )', `\back \back |', `*', `+', `?'} respectively.

The table in Appendix \ref{app:XDRE_constants} describes what each
XDRE represents.

%------------------------------------------------------------
\subsection{The \etree}
%------------------------------------------------------------

The \etree\ is the structural representation of the file being
edited. It is maintained and manipulated through the facilities
provided by the XD data model.

The \etree\ is practically a `seed' XD-object, that is, a list of
objects which are lists themselves. A simple example of an \etree\
object may be:

\begin{code}
\begin{verbatim}
(seed (header (eleName "") 
              (attList (attribute (attName "version") 
                                         (attValue "1.0")) 
                              (attribute (attName "encoding") 
                                         (attValue "UTF-8")) 
                              (attribute (attName "standalone") 
                                         (attValue "no")) 
                              (attribute (attName "extDTD") 
                                         (attValue "SYSTEM \"dtdfile.dtd\""))))) 
       (comment "Simple document") 
       (element (header (eleName "root") 
                        (attList (attribute (attName "att1") 
                                            (attValue "val1")))) 
                (charData "This is some character data") 
                (element (header (eleName "child"))) 
                (PI (PITarget "aTarget") 
                    (PIBody "aBody"))))
\end{verbatim}
\end{code}

This may be extracted from an XML document that looks like:

\begin{code}
\begin{verbatim}
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE root SYSTEM "dtdfile.dtd">

<!-- Simple document-->

<root att1="val1">
This is some character data
   <child/>
   <?aTarget aBody?>
</root>
\end{verbatim}
\end{code}


%------------------------------------------------------------
\subsection{\C The \ebuf}
%------------------------------------------------------------

The \ebuf\ is an instance of an Emacs buffer, which is an internal
data structure that Emacs is able to display\footnote{See
chapter~\ref{sec:backgroundBuffer} for details on Emacs buffers.}. It
contains therefore the information that relates the \etree\ to its
visual representation.

Some of this information is implicitly given in terms of the `syntax
table', which relates patterns to layout and behavioral
characteristics. Emacs is made aware of these relations and takes care
of displaying text that matches those patterns according to the
relevant layout.

Other layout information is explicitly attached dynamically to
portions of the \ebuf\ using text properties and/or overlays, and it
is maintained constantly consistent with the \etree.

At this point of the development I have not designed the details of
the implementation of the \ebuf. It will be mostly developed
experimentally, since I am not familiar with the technologies
involved. The background knowledge has been collected and looks
very promising; the next steps in the design of the \ebuf\ will be
parallel to the development of the prototype of the Emaxml mode, and
finally a proper specification will be produced for documentation
purposes.

%================================================================
\section{\C Whitespace handling}\label{sec:whitespace}
%================================================================

A piece of whitespace is a sequence of one or more spaces, tab
characters, carriage return characters or linefeed characters. In the
following discussion I refer to whitespace which is not part of
markup, i.e. it is part of a piece of character data.

Whitespace which is between other non-whitespace characters is
certainly part of the character data and must be preserved, while
whitespace which is immediately before or after a piece of markup
(from now on referred to as {\bf boundary} whitespace), may be
there for one of two reasons:

\begin{itemize}

\item Because it is an integral part of the topic of the document
  (e.g. indentation such as in C code or poetry), and must be
  preserved.

\item Because it is used to make the XML file more human-readable in
  raw XML format (e.g. blank lines, or tabs used for
  indentation). This whitespace does not affect the semantics of an
  XML file, and it should be up to the user whether to preserve it or
  not.

\end{itemize}

A sequence of whitespace characters only between two markup constructs
corresponds in the \etree\ to a {\tt charData} object whose string is
whitespace only. Such an instance is called {\bf void}.

In general, Emaxml is set to comply with one of the following policies
for boundary whitespace, at the user's choice:

\begin{itemize}

\item {\bf Allow-all}: do not perform any processing on the boundary
  whitespace.

\item {\bf Allow-none}: no boundary whitespace is preserved at all.
  
\item {\bf Allow-all-but-void}: preserve boundary whitespace but
  discard void {\tt charData} objects. In practice, whitespace between
  markup that does not contain any character data is considered to be for
  indentation purposes only. This is the default policy.

\end{itemize}

In practice, the software components apply the chosen policy as
described in the sections below concerned with their definitions.

%================================================================
\section{\C Software components}
%================================================================

As said, the system is concretely an Emacs mode called Emaxml mode.
This is the major infrastructure that manages the editing, using the
Parser and the Writer (the other two components of the system) for
specific tasks.

The Parser and the Writer are effectively two independent, reusable
pieces of software, based upon a common data model, the XD data model.

%------------------------------------------------------------
\subsection{\C The Parser}
%------------------------------------------------------------

The {\bf XMLDoc Parser} ({\bf XDP} for short) takes as input an Emacs
buffer in Fundamental mode (hence with no meta-information about text
properties) containing an XML document and extracts the corresponding
\etree.  The Parser checks the syntax of the document and gives an
indication of the error in case it is not correct.

XDP has been implemented already, and tested informally, so the
following is a description rather than a design specification. The
idea behind XDP is to provide an independent, reusable tool for
parsing XML files according to the XD data model. In some respect,
XDP and the XD data model are quite limited, since they do not
cover all the aspects of an XML document to a high degree of
detail, but they can be useful for any application that, like
Emaxml, needs to represent and manipulate the skeleton of an XML
document for practical purposes.

Emaxml parses an XML document by moving point in the buffer which
contains the XML file. At the current position of point, the
parser expects to find a sequence of characters that corresponds
to one of a series of possible XD-objects, according to the
BNF-defs. If such a sequence is found, the corresponding XD-object is
built ({\bf object extraction}), and point is advanced, otherwise
the parsing is unsuccessful.

The parsing process is a recursive one, so at the end of the day
it consists of placing point at the beginning of the buffer and
trying to extract a `seed' object.

XDP is coded in the {\tt \home /tesi/XDP/XDP.el} file and consists
of:

\begin{itemize}

	\item a set of auxiliary functions (the {\bf XDP Toolkit}),
	that carry out general operations related to parsing;

	\item a set of extracting functions (the {\bf
	XDC-functions}), each of which is concerned with parsing an
	XML component.

\end{itemize}

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{\C Whitespace in parsing}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

The Parser is responsible for filtering the whitespace present in the
XML file according to the chosen whitespace policy.

If the policy is ``Allow-none'', all boundary whitespace is removed
from the character data.

If the policy is ``Allow-all'', all whitespace is preserved in the
character data.

If the policy is ``Allow-all-but-void'', all whitespace is preserved,
but void `charData' objects are not.

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{\C XDP Toolkit}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Parsing and in particular object extraction involve some
elementary operations, provided by the XDP toolkit, that fall in
one of the following categories:

\begin{itemize}

\item {\bf Matching and skipping}

  The parser often needs to check if the text starting at
  point matches a particular regexp. It may need to retrieve
  it or ignore it. Functions like {\tt XDP-match-minus} or
  {\tt XDP-skip} provide such operations.

\item {\bf BNF construct handling}

  The objects to be extracted derive from the BNF-defs, which are
  composed of {\em conjunctions} (sequences), {\em disjunctions}
  (selections, `$\mid$') and {\em repetitions} (`*', `+',
  `?'). Functions in this category (such as {\tt XDP-and} or {\tt
  XDP-*}) provide these features.

\item {\bf Object manipulation}

  Functions in this category provide operations that are
  object-specific such as translating a standard entity
  reference to the corresponding character, or extracting
  information from the prolog of the XML document, or
  building an object from its components.

\end{itemize}
		

Generally speaking, XDP functions try to match the contents of the
buffer at point with something (a regular expression, the result
of one or more other XDP functions, ...) and return what matched.

A return value of \lispt\ means that the requested match was not
found but the function is successful anyway. For example, when
trying to match ``0 or more instances of something'', a non-match
is a success nonetheless.

All XDP functions are expected to leave point at the end of what
they matched, or where it was if nothing was matched.

See Appendix \ref{app:XDP_functions} for the list and details of
the XDP functions.

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{\C XDC parsing functions}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Every XD-type has a corresponding XDC-function that parses the text
at point and returns an object of that type if one was there, or
\lispnil.

Moreover, there are several XDC-functions that refer to some
BNF-defs. The object returned by such a function is not in the XD data
model, but has the same structure as an XD-object. For example, {\tt
XDC-prolog} parses the prolog of an XML document as defined by the
BNF-def number 22.

A return value of \lispnil\ means that the object was not recognized
at point.

Most XDC-functions are constructed straightforwardly by
reproducing the BNF-def using a combination of XDP functions, XDRE
regexps and XDC functions themselves, e.g.:

\begin{code}
\begin{verbatim}
01 ;; [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
02 ;;                      ('[' (markupdecl | DeclSep)* ']' S?)? '>'
03 ;; doctypedecl -> Name ExternalID? InternalID?
04
05 (defun XDC-doctypedecl ()                                  ;
06   (XDP-build 'doctypedecl                                  ;
07              (XDP-skip "!DOCTYPE" XDRE-S)                  ; '<!DOCTYPE' S
08              (XDC-Name)                                    ; Name
09              (XDP-01 (XDP-and (XDP-skip XDRE-S)            ; (S
10                               (XDC-ExternalID)))           ; ExternalID)?
11              (XDP-01 (XDP-skip XDRE-S))                    ; S?
12              (XDP-01 (XDP-and (XDP-skip "\\[")             ; ('['
13                               (XDC-InternalID)             ; (markupdecl | DeclSep)*
14                               (XDP-skip "\\]")             ; ']'
15                               (XDP-01 (XDP-skip XDRE-S)))) ; S?)?
16              (XDP-skip ">")))                              ; '>'
\end{verbatim}
\end{code}

Lines 1-2 contain the BNF-def as from \cite{w3c}.

Line 3 describes what the object is composed of, i.e. a Name object,
possibly an ExternalID object, possibly an InternalID object.

Line 6 invokes the XDP-build function to build a `doctypedecl'
object as described by lines 7-16.

Line 7 skips over `\lt!DOCTYPE' and whitespace.

Line 8 extracts a Name object.

Lines 9 and 10 deal with an optional pair of whitespace followed
by an ExternalID object, and extract the latter.

Line 11 skips over some optional white space.

...and so on.

%------------------------------------------------------------
\subsection{\C The Writer}
%------------------------------------------------------------

The Writer carries out the opposite of the Parser: it takes an
\etree\ and produces an Emacs buffer, in Fundamental mode, whose
contents are the XML document corresponding to that \etree.

The \etree\ received as input can be assumed to be always errorless
(i.e. to be formed of legal XD-objects as defined in
section~\ref{sec:theXDDataModel}), since it may only have been produced by
the Parser or by the Emaxml mode, which both enforce syntactic control
over the structure.

An XML document $d$ is said to be {\bf correctly produced} from an
\etree\ $e$ under a certain whitespace policy\footnote{Please refer to
section~\ref{sec:whitespace} for definition of whitespace policy.} $w$
if and only if the result of parsing $d$ under $w$ is equal to $e$.

A set of functions ({\bf XWC-functions}\footnote{The prefix ``XWC-''
is chosen to match that of the XDC-functions, because an XWC function
is the inverse of the XDC-function for the same XD-type. For example,
if $p$ is an XD-object of type `attList', then $p$=({\tt
XDC-attList}({\tt XWC-attList}($p$))).}) provides translation from an
XD-object to an equivalent string in XML syntax with no whitespace
added.

This string is then processed to add whitespace according to the
current whitespace policy; in terms of whitespace, the Writer can produce an
XML file optimized for:

\begin{itemize}

\item \underline{Storage} (i.e. with no extra whitespace added), if
the policy is `Allow-none'.

\item \underline{Human inspection} (i.e. the markup is indented), if
the policy is `Allow-all' or `Allow-all-but-void'. The indentation
style is implemented very simply at this stage of development, and can
be improved later.

\end{itemize}

In any case, the character data is written as it is (hence the Parser
and the Emaxml mode are responsible for this); only in the second case
is the markup also indented.

As said, the result of the Writer is a buffer containing plain text,
which can be saved, or inspected and modified. Before saving it,
Emaxml checks the possible internal DTD as described in
section~\ref{sec:savingAFile}, using the Parser.

%------------------------------------------------------------
\subsection{\C The Emaxml mode}
%------------------------------------------------------------

Emaxml is to be implemented as a major Emacs mode. The design of this
implementation is described in the following sections at a very high
level, i.e. it is a set of steps that must be carried out rather than
a detailed definition. A suitable specification will finally be
produced for documentation purposes.

The main source of information about the features of an Emacs mode
and the techniques used to set one up is the Emacs Lisp manual
(\cite{elisp}), to which one may refer for further detail. See also
Appendix~\ref{app:glossary} for brief definitions of some Emacs
concepts.

% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{\C General description of an Emacs mode}\label{sec:modeDefn}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

A {\em mode} is a set of definitions that customize Emacs and can be
turned on and off by the user.  There are two varieties of modes:
{\em major modes}, which are mutually exclusive and used for editing
particular kinds of text, and {\em minor modes}, which provide
features that users can enable individually.

An example of a mode is `C' mode, whose purpose is to edit C code
files. This mode is activated when loading a C file, or by calling the
Emacs Lisp function `c-mode'. Some of the many features available when
a buffer is in C mode are:

\begin{itemize}

\item the syntactic constructs of C are highlighted in different
  colors; this also provides an instantaneous syntactic check;

\item when the user types a closing brace the corresponding opening
brace blinks;

\item the text can automatically be indented according to one of many
  styles;

\item the program can be compiled in Emacs;

\item point can be moved to the next/previous function;

\item there are tools for version control and debugging.

\end{itemize}

Some of these functionalities are automatically managed by the mode,
others are activated by a key sequence or by an item in a menu.

Most features of the mode can be finely tuned using the Emacs
customization system.
 
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\subsubsection{\C Implementing an Emacs mode}\label{sec:implementingMode}
% - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Implementing a mode consists basically of two phases\footnote{Please
refer to Appendix~\ref{app:glossary} for the definition of the
technical terms in this section.}:

\begin{itemize}

\item writing the Lisp functions that perform the various operations;

\item setting up the mode, that is, making Emacs aware of when and how
  to use those functions, and which way it should perform its common
  operations.

\end{itemize}

In general, a mode is set up by defining its components:

\begin{itemize}

\item The {\em keymap} maps key sequences to Lisp functions.

\item The {\em syntax table} defines categories of characters in terms
  of the syntax.

\item The {\em buffer-local variables} can be used to define the
  behavior of Emacs relative to some common functions. For example, C
  mode and Lisp mode both set the variable `paragraph-start' to
  specify that only blank lines separate paragraphs.  They do this by
  making the variable buffer-local in the buffer that is being put
  into C mode or Lisp mode, and then setting it to the new value for
  that mode.

\item The {\em standard hooks} can be used to make Emacs perform some
  common operation in an appropriate way, peculiar to the new mode.

\item {\em New hooks} are defined and set to default values. The new
  mode will call the functions listed in these hooks when performing
  particular operations, so allowing the user or a developer to
  customize the behavior of the mode by changing the values of the
  hooks.

\end{itemize}


%% \todo{whitespace:If Allow-none is applied, Emaxml automatically
%%    removes whitespace at the beginning and end of a \luchardata\ when
%%    point leaves that \luchardata. If Allow-none or Allow-all-but-void
%%    is applied, Emaxml ensures that no void \luchardata\ \logus\ are
%%    created by enforcing low-level control. (rewrite)}


%================================================================
\section{\C Implementation of low-level control}
%================================================================

Low-level control (see section~\ref{sec:lowlevelcontrol}) concerns the
contents of elementary \logus, and can be achieved in two ways:

\begin{description}

\item[by constant monitoring:] the check is triggered by a change in
  the \logu, by means of hooking a function to the appropriate text
  property or overlay;

\item[by checking on exit:] the check is triggered by point moving out
  of the \logu, in the functions that are mapped to the keys used for
  moving point.

\end{description}

In both cases the user should be advised of the error, yet be left free
to keep it if s/he wants to. A \logu\ containing an accepted error
should be highlighted on the display.

The first policy offers a more sophisticated control, but involves
matching a regular expression every time the user inserts/deletes a
character or kills/yanks a string. This may prove very
resource-consuming.

The second policy lets the user type anything and then advises him/her
that something is wrong when s/he tries to leave the \logu.

Therefore if ``checking on exit'' is implemented, Emaxml should
reasonably also:

\begin{itemize}

\item advise on where exactly the error is;

\item remember somehow which errors have been already accepted, and
keep this information updated automatically if the error is corrected,
otherwise the same error would be detected every time the user moves
point through the offending elementary \logu;

\item save this information for successive sessions.

\end{itemize}

Which policy is best?

``Constant monitoring'' is more resource-consuming, but resources
are abundant these days (and more in the future).

``Checking on exit'' relies on surplus information, whose maintenance
increases the overall complexity of the system, hence decreasing its
expandability.

As a conclusion, ``constant monitoring'' is chosen as low-level
control policy.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\chapter{Testing design}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\todo{Starting point: Plan, chapter ``Testing''}

%\todo{More ideas: testing the system on users.}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{\C Documentation design}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Following the Emacs philosophy and spirit, Emaxml is meant to be
an extendible system.

Useful and usable documentation is a key requirement. If only part
of the target functionalities is implemented, but it is well
documented, it will still be a partial success.

Although the most comprehensive source of documentation for future
(possible) developers will be the Dissertation Deliverable, two more
documentation sources are required: code documentation (again for
developers) and Emacs on line help (for the
end\footnote{...possible...} users).

The Emacs Lisp Manual (\cite{elisp}) has a chapter (``Tips and
Conventions'') that covers the conventions to be used in writing,
commenting and documenting code for Emacs. Emaxml documentation is to
comply with those conventions.

%================================================================
\section{\C Code documentation}
%================================================================

The code is to be documented both internally and by a set of tables
that list the functions and are included as appendices in the
Dissertation Document. The latter is not required by the Emacs
standard, but it is included for quick reference and a general view of
the structure of the code.

%------------------------------------------------------------
\subsection{\C Internal documentation}
%------------------------------------------------------------

In Lisp a semicolon starts a comment, but, as a convention in Emacs
Lisp, a comment is classified according to the number of semicolons
put at the beginning. The indentation commands of the Lisp modes in
Emacs, such as `M-;' (`indent-for-comment') and \lt{}TAB\gt{}
(`lisp-indent-line'), automatically indent comments according to these
conventions, depending on the number of semicolons.

These are the conventions recommended:

\begin{itemize}

\item Comments that start with a single semicolon, `;', usually
explain how the code on the same line does its job.  In Lisp mode and
related modes, the `M-;' (`indent-for-comment') command automatically
inserts such a `;' in the right place, or aligns such a comment if it
is already present.

\item Comments that start with two semicolons, `;;', usually describe
the purpose of the following lines or the state of the program at that
point.

\item Comments that start with four semicolons, `;;;;', are used for
headings of major sections of a program.

\end{itemize}

%------------------------------------------------------------
\subsection{\C Function tables}
%------------------------------------------------------------

Functions belonging to an Emacs mode are of two types: interactive
functions, meant to be called by the end user (via a key combination,
a menu, or by name) and internal functions, which perform the
auxiliary computation.

All functions are listed in tables (put in appendices in the
Dissertation Document), grouped by software component. One further
table lists the interactive functions from all software components.

The purpose of these tables is to give a general, wide view of the
structure of the entire program, its main components and the naming
conventions adopted, and to serve as a quick reference for the number
and type of arguments.

Each entry contains the name of the function, the name and type of its
arguments, and a brief description.

%================================================================
\section{\C Emacs on line documentation}
%================================================================

The standard on line documentation for an Emacs mode includes:

\begin{description}

\item[Function documentation strings] Every command, function, or
  variable intended for users to know about should have a
  documentation string. This is enclosed in the function or variable
  definition itself as the very first line of the code, and it is
  displayed when information is asked about that particular function
  or variable using Emacs help facilities such as Apropos, `C-h f'
  (`describe-function') or `C-h v' (`describe-variable').

  Documentation strings are described in the Emacs Lisp Manual
  (\cite{elisp}, nodes ``Documentation Basics'' and ``Documentation
  Tips'') and allow easy inclusion of hyperlinks to other parts of the
  documentation.

\item[Manual] An Emacs mode is documented in a manual for the end
  user, written in the Texinfo language and organized in terms of
  topics of discussion.

\end{description}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{\C Point of the situation and plan}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

So far, I have implemented the Parser, which works as described and
has been tested informally on a set of XML files.

The work has been carried out in a sort of experimental
fashion. By this I mean that I have been loosely following a cycle

\begin{figure}[h]
\begin{center}
\includegraphics[width=8cm]{../deliv1/fig-waterfall.eps}
\end{center}
\end{figure}

This is due to my initial lack of experience with Emacs internals,
Lisp, and XML.

What is missing is testing, which has been discussed in Deliverable 1
to some degree.

My plan for the future is to implement the Writer and test it with the
Parser, then implement the Emaxml mode and test it. Finally, I will
work on the Dissertation Document. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	
\clearpage

\addcontentsline{toc}{chapter}{Bibliography}

\begin{thebibliography}{99}

\bibitem{w3c} \book{W3C XML Core Working Group}{Extensible Markup
  Language (XML) 1.0 (Second
  Edition)}{http://www.w3.org/TR/2000/REC-xml-20001006}{2001}
  
  This is the official specification of XML. Includes the BNF grammar
  describing the syntax of XML.

\bibitem{nut} \book{E. Rusty Harold and W. Scott Means}{XML in a
  Nutshell}{O'Reilly}{2001}

  Good discursive explanation of the basics of the various aspects of
  XML, plus a comprehensive coverage of all related topics and
  applications. I found it useful for initial documentation, and also
  as a quick reference.

\bibitem{info} \book{Free Software Foundation}{Emacs Info Manual}{Free
  Software Foundation}{1999}

  Major source of information about the usage of Emacs. It is more
  than an on-line help; it can be searched in many ways and, as far as
  my experience is concerned, always answers one's
  questions. Moreover, it does not pop up unwanted saying that you are
  writing a letter.

\bibitem{elisp} \book{B. Lewis, D. LaLiberte and R. Stallman and the
  GNU Manual Group}{Emacs Lisp
  Manual}{http://www.gnu.org/manual/elisp-manual-20-2.5/elisp.html}{1993}

  A book on Lisp, Emacs Lisp, Emacs internals, Emacs Lisp
  libraries. As readable as a novel, as useful as a quick
  reference. Available in a variety of formats including Info, which
  makes it embedded in Emacs.

\bibitem{extend} \book{B. Glickstein}{Writing GNU Emacs
  Extensions}{O'Reilly}{1997}

  Covers the customization of Emacs from the very basics of Lisp to a
  full major mode implementation. Very rich in practical examples
  paired with Lisp theory.

\bibitem{holybible} \book{H. Abelson, G. J. Sussman and
  J. Sussman}{Structure and Interpretation of Computer Programs}{The
  MIT Press}{1985}

  An inspiring book, accidentally about Lisp, and purposefully about
  abstraction. It has been said that its footnotes alone are more
  interesting than most books around.

\bibitem{newtestament} \book{M. Gancarz}{The UNIX Philosophy}{Digital
Press}{1995}

  Software Engineering is not just a waterfall.

\end{thebibliography}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{\C \Logus\ and their categories}\label{app:thelogus}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Fig.~\ref{tbl:lus} is a list of the types of \logus\ in Emaxml.

\begin{tblenv}
\begin{tabular}{|p{6cm}|p{8cm}|}

	\hline

	\luele & A whole element, including its header and its
	children \\

	\hline

	\luheader & An element's name and its attributes \\

	\hline

 	\luelename & The first word of a \luheader \\

	\hline

	\luattlist & The set of attributes of an element and their
	values \\

	\hline

	\luatt & An attribute's name and its value \\

	\hline

	\luattname & An attribute's name \\

	\hline

	\luattvalue & An attribute's value \\

	\hline

	\luchardata & The plain text. In \luchardata\ \logus,
	illegal characters ('"\lt\gt\&) are edited and displayed
	normally. However, they are codified to entity references when
	the XML file is written.\\

	\hline

	\lupi & A processing instruction \\

	\hline

	\lupitarget & The target of a processing instruction, i.e. the
	first word of a \lupi \\

	\hline

	\lupibody & The command of a processing instruction \\

	\hline

	\lucomment & A comment text \\

	\hline

	\luintdtd & The text relative to the definition of the
	internal DTD \\

	\hline

	\luentref & The text of an entity reference \\

	\hline


\end{tabular}
\caption{Logical Units} \label{tbl:lus}
\end{tblenv}

Subsets of \Logus\ are grouped into categories according to some
properties, as described in Fig.~\ref{tbl:lucat}.

\begin{tblenv}
\begin{tabular}{|p{3cm}|p{3cm}|p{8cm}|}

	\hline

	{\bf Category} & {\bf Property} & {\bf \Logus} \\

	\hline \hline

	{\bf Elementary} & Composed of characters only. & \luelename,
	\luattname, \luattvalue, \luchardata, \lupitarget, \lupibody,
	\lucomment, \luintdtd, \luentref. \\

	\hline

	{\bf Compound} & Composed of elementary and compound \logus. &
	\luele, \luheader, \luattlist, \luatt, \lupi. \\
	
	\hline

	{\bf \C Primary} & \Logus\ that are the main building blocks
	of the structure of the document at the {\em structure level}
	of document authoring, as opposed to the {\em syntactic level}
	(see \ref{sec:AimsAndObjectives}).  & \luele, \luchardata,
	\lupi, \lucomment, \luentref, \luintdtd \\

	\hline

	{\bf \C Secondary} & \Logus\ that are not really part of the
	structure of the document from a high level point of view, but
	which it is useful to identify on the display for editing
	purposes. & \luheader, \luattlist, \luatt, \luelename,
	\luattname, \luattvalue , \lupibody, \lupitarget. \\

	\hline

	{\bf Special} & Primary \logus\ which cannot contain other
	primary \logus, i.e. are terminal nodes of the structure of
	the document. & \luchardata, \lupi, \lucomment, \luentref,
	\luintdtd. \\

	\hline

	{\bf Multiline} & May contain {\em newline} characters. &
	\luintdtd, \lucomment, \luchardata, {\luattvalue}.\\

	\hline

	{\bf Monoline} & May not contain {\em newline} characters. &
	\luelename, \luattname, \luattvalue, \lupitarget, \lupibody,
	\luentref. \\

	\hline

\end{tabular}
\caption{Categories of \Logus} \label{tbl:lucat}
\end{tblenv}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Standard Emacs commands recoded for Emaxml} \label{app:stdcmd}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

{\bf \underline{Notes}}:

\begin{itemize}

	\item Editing operations are here referred to by their Emacs
	Lisp names, due to lack of space. For an exact definition,
	where these names are not self-explanatory, use
	`F1~f~$<$command$>$' in Emacs.

	\item The {\bf Cat} column refers to the editing categories
	listed in section \ref{sec:edop}.

\end{itemize}


\begin{tblenv}
\begin{tabular}{|l|l|l|p{8cm}|}

	\hline \hline

	{\bf Cat} & {\bf Shortcut} & {\bf Emacs Lisp name} & {\bf
	Action} \\

	\hline \hline

	ID & C-d & delete-char & Delete the following character. No
	action at end of an elementary \logu. \\

	ID & ESC k & kill-sentence & Performs kill-element. \\

	ID & C-y & yank & Described in \ref{sec:yank}. Perform
	``yanking as a child''.\\

	ID & S-\key{insert} & yank & Described in
	\ref{sec:yank}. Perform ``yanking as a sibling''.\\
 
	ID & \key{BACKSPACE} & delete-backward-char & In the middle of
	an elementary \logu, behave as usual. At beginning, behave as
	C-b.\\

	ID & \key{RET} & newline & Multiline \logu: add a newline at
		point.\\

	ID & \key{insert} & overwrite-mode & Usual behavior. \\


	GE & C-l & recenter & Usual behavior \\

	GE & C-\_, C-/ & undo & Usual behavior \\

	KY & C-k & kill-line & Monoline: Kill the rest of the current
	\logl. Multiline: also, if no non-blanks there, kill through
	newline.\\

	MK & C-@, C-\key{SPC} & set-mark-command & Usual behavior.\\

	MK & C-x C-x & exchange-point-and-mark & Usual behavior. \\

	PM & C-a & beginning-of-line & Move point to beginning of
	current \logl.\\

	PM & C-b, \kl & backward-char & Slide backward.\\

	PM & C-\kd & forward-paragraph & Traverse down. \\

	PM & C-e & end-of-line & Move point to end of current \logl.\\

	PM & C-f, \kr & forward-char & Slide forward.\\

	PM & C-\kl & backward-word & Usual behavior, through user
	space. \\

	PM & C-n, \kd & next-line & Slide to next \logl, with
	\traveller.\\

	PM & C-p, \ku & previous-line & Slide to previous \logl, with
	\traveller.\\

	PM & C-\kr & forward-word & Usual behavior, through user
	space.\\

	PM & C-\ku & backward-paragraph & Traverse up. \\

	PM & \key{end} & end-of-buffer & Usual behavior. \\

	PM & \key{home} & beginning-of-buffer & Usual behavior. \\

	RE & C-x h & mark-whole-buffer & Usual behavior. \\

	RE & ESC @ & mark-word & Usual behavior. \\

	RE & double-mouse-1 & mouse-set-point & Usual behavior, but
	highlighting the region as described in section
	\ref{sec:high}.\\

	RE & drag-mouse-1 & mouse-set-region & Usual behavior. Region
	as described in section \ref{sec:high}.\\

	RE & mouse-1 & mouse-set-point & Usual behavior, but
	highlighting the region as described in section
	\ref{sec:high}.\\

	RE & mouse-2 & mouse-yank-at-click & Usual behavior, but
	yanking done a la Emaxml (see section~\ref{sec:yank}).\\

	RE & triple-mouse-1 & mouse-set-point & Usual behavior, but
	region set to the whole \logu, and highlighting as described
	in section \ref{sec:high}.\\

	\hline

\end{tabular}
\caption{New meanings for old commands}

\end{tblenv}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Emaxml-specific commands} \label{app:newcmd}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{tblenv}
  \begin{tabular}{l|p{8cm}}

    {\bf Lisp-like non-definitive name} & {\bf Action} \\

    \hline

    create-element-child & Create a blank \luele\ as a child of the
    current smallest element. Described in section~\ref{sec:create}.\\

    create-element-sibling & Create a blank \luele\ as a sibling of
    the current smallest element. Described in
    section~\ref{sec:create}.\\

    create-{\em logun} & Create a blank instance of a \logu\ of type
    {\em logun}.\\

    create-primary & Create a sibling of the current primary \logu\
    just before the current primary \logu. \\

    traverse-right & Traverse right. Described in
    section~\ref{sec:move}. \\

    traverse-left & Traverse left. Described in
    section~\ref{sec:move}. \\

    mark-more & Described in section \ref{sec:high} \\

    mark-less & Described in section \ref{sec:high} \\

  \end{tabular}
  \caption{New commands to be implemented}

\end{tblenv}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Details of XDRE constants}\label{app:XDRE_constants}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{tblenv}
\begin{tabular}{l|l|p{8cm}}

	{\bf BNF-def} & {\bf XDRE constant} & {\bf Explanation} \\

	\hline

	[2] Char & XDRE-Char & Unicode character range \\

	[3] S & XDRE-S & White Space \\

	[87] CombiningChar & XDRE-CombiningChar & Among others, this
	class contains most diacritics \\

	[89] Extender & XDRE-Extender & Extenders \\

	[85] BaseChar & XDRE-BaseChar & Among others, this class
	contains the Unicode alphabetic characters of the Latin
	alphabet \\

	[86] Ideographic & XDRE-Ideographic & Unicode ideographic
	characters \\

	[84] Letter & XDRE-Letter & BaseChar's + ideographic
	characters \\

	[88] Digit & XDRE-Digit & Unicode digits \\

	[4] NameChar & XDRE-NameChar & Characters allowed in Names \\

	[5] Name & XDRE-Name & Matches a legal Name \\

	[25] Eq & XDRE-Eq & Equality sign \\

	[68] EntityRef & XDRE-EntityRef & Matches an entity reference
	(e.g.\ `\&cright;') \\

	[66] CharRef & XDRE-CharRef & Matches a character reference
	(e.g.\ `\&\#x040B;') \\

	[19] CDStart & XDRE-CDStart & Matches `\lt![CDATA[' \\

	[21] CDEnd & XDRE-CDEnd & Matches `]]\gt', the CDATA section
	terminator \\

	[69] PEReference & XDRE-PEReference & Matches a Parameter
	Entity (eg. `\%abc;') \\

	[26] VersionNum & XDRE-VersionNum & Matches the version number
	declaration in an XML Declaration \\

	[81] EncName & XDRE-EncName & Matches the encoding name in an
	Encoding~Declaration \\

	[13] PubidChar & XDRE-PubidChar & Characters allowed in names
	of PubidLiteral's \\

\end{tabular}
\caption{The XDRE toolkit}
\end{tblenv}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Details of the XD-types}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Fig.~\ref{tbl:XD:types} is the list of the types belonging to the
XD data model.

The notation used is:

\begin{itemize}

	\item[] $\to$: ``has as children'';

	\item[] *: ``zero or more'';

	\item[] ?: ``zero or one'';

	\item[] +: ``one or more'';

	\item[] $\mid$: indicates alternative children.

\end{itemize}

\begin{tblenv}
\begin{tabular}{|lll|}

\hline

seed & $\to$ & header intDTD? (comment $\mid$ PI)* element
(comment $\mid$ PI)* \\

intDTD & $\to$ & {\em string} \\

element & $\to$ & header (element $\mid$ comment $\mid$ PI $\mid$
entRef $\mid$ charRef $\mid$ charData)*\\

header & $\to$ & eleName attList* \\

eleName & $\to$ & {\em string} \\

attList & $\to$ & attribute+ \\

attribute & $\to$ & attName attValue \\

attName & $\to$ & {\em string} \\

attValue & $\to$ & ({\em string} $\mid$ entRef $\mid$ charRef)* \\

comment & $\to$ & {\em string} \\

PI & $\to$ & PITarget PIBody \\

PITarget & $\to$ & {\em string} \\

PIBody & $\to$ & {\em string} \\

charRef & $\to$ & {\em string} \\

entRef & $\to$ & {\em string} \\

charData & $\to$ & {\em string} \\

\hline

\end{tabular}
\caption{Data types in the XD data model.}
\label{tbl:XD:types}
\end{tblenv}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Details of XDP functions}\label{app:XDP_functions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{tblenv}
\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-* \\

	{\bf Functionality performed} & Checks if point is at 0 or
	more occurrences of FORM. \\

	{\bf BNF construct handled} & Char* \\

	{\bf Parameters} & FORM - a Lisp form composed of XDP and XDC
	constructs. \\

	{\bf Return} & \lispt\ if 0 occurrences found. A list of the
	occurrences found otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-01 \\

	{\bf Functionality performed} & Checks if point is at 0 or 1
	occurrences of FORM. \\

	{\bf BNF construct handled} & S? \\

	{\bf Parameters} & FORM - a Lisp form composed of XDP and XDC
	constructs. \\

	{\bf Return} & \lispt\ if 0 occurrences found. The occurrence
	found otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-match \\

	{\bf Functionality performed} & Checks if point is looking-at
	RE. \\

	{\bf BNF construct handled} & [\^{}\lt\&'] \\

	{\bf Parameters} & RE - a regular expression. \\

	{\bf Return} & The match-string if matched. \lispnil\
	otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-match-minus \\

	{\bf Functionality performed} & Checks if point is looking-at
	the difference regexp (RE1 - RE2). \\

	{\bf BNF construct handled} & Name - (('X' $\mid$ 'x') ('M' $\mid$ 'm')
	('L' $\mid$ 'l')) \\

	{\bf Parameters} & RE1 and RE2 - two regexps. \\

	{\bf Return} & The match-string if matched. \lispnil\
	otherwise. \\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-match-until \\

	{\bf Functionality performed} & If looking-at 'RE1*TERMINATOR'
	return what matches 'RE1*' and set point at end of it. \\

	{\bf BNF construct handled} & ((Char - '-') $\mid$ ('-' (Char -
	'-')))* \\

	{\bf Parameters} & RE1 - a regexp.  TERMINATOR - a string.\\

	{\bf Return} & What matches `RE1*' if matched. \lispnil\
	otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-skip \\

	{\bf Functionality performed} & Skips over a regular
	expression. Used for portions of buffer that don't represent
	any object. \\

	{\bf BNF construct handled} & "'" \\

	{\bf Parameters} & RE - a list of one or more regexps.\\

	{\bf Return} & \lispt\ if RE is matched. \lispnil\
	otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-build\\

	{\bf Functionality performed} & Constructs an object by
	putting together the results of the forms in ITEMS. It is
	based on a call to XDP-and whose result is put in a
	one-element list.\\

	{\bf BNF construct handled} & [32] SDDecl ::= ...\\

	{\bf Parameters} & ITEMS - a list of Lisp forms.\\

	{\bf Return} & An object, if all of ITEMS matched. \lispnil\
	otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-and \\

	{\bf Functionality performed} & Handles sequences. It also
	deals with forms that return \lispt\ to mean a successful
	non-match, by not appending the \lispt\ to the list
	returned.\\

	{\bf BNF construct handled} & STag content ETag\\

	{\bf Parameters} & FORMS - a list of Lisp forms.\\

	{\bf Return} & A list with the results of evaluating FORMS if
	all non-\lispnil. \lispnil\ otherwise.\\

	\hline

\end{tabular}

\begin{tabular}{|l|p{8cm}|}

	\hline

	{\bf XDP} & XDP-or \\

	{\bf Functionality performed} & Handles selections.\\

	{\bf BNF construct handled} & Comment $\mid$ PI $\mid$ S\\

	{\bf Parameters} & FORMS - a list of Lisp forms.\\

	{\bf Return} & The first form in FORMS that matches what point
	is at, if any. \lispnil\ otherwise.\\

	\hline

\end{tabular}
\end{tblenv}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Glossary of Emacs technologies}\label{app:glossary}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

This chapter briefly defines some technologies related to the design
of Emaxml. It is not intended to be exhaustive. For a complete and
better explanation refer to the Emacs Lisp Manual (\cite{elisp}).

\begin{description}

\item[Face] A face in Emacs jargon is a set of layout attributes,
  namely: font family, width, height, weight, slant, underline,
  overline, strike-through, box, inverse-video, foreground,
  background, stipple, inherit.

\item[Echo Area] The Echo Area is a line at the bottom of
  an Emacs frame, for displaying messages.

\item[Keymap] The keymap is the data structure that records the
  bindings of key sequences to the commands that they run.  For
  example, the global keymap binds the character `Ctrl-n' to the
  command function `next-line', therefore when `Ctrl-n' is pressed the
  cursor moves to the next line.

  One of the characteristics of an Emacs mode is which key combinations
  trigger which operations. These are defined by the mode keymap.

\item[Syntax Table] A syntax table provides Emacs with the
  information that determines the syntactic use of each character in a
  buffer.  This information is used by the parsing commands, the
  complex movement commands, and others to determine where words,
  symbols, and other syntactic constructs begin and end.

  Each buffer has its own major mode, and each major mode has its own
  idea of the syntactic class of various characters.  For example, in
  Lisp mode, the character `;' begins a comment, but in C mode, it
  terminates a statement.  To support these variations, Emacs makes
  the choice of syntax table local to each buffer.  Typically, each
  major mode has its own syntax table and installs that table in each
  buffer that uses that mode.

\item[Hook] A hook is a variable where it is possible to store a
  function or functions to be called on a particular occasion by an
  existing program.

  For instance, if the name of a function is added to the variable
  `after-save-hook' (using function `add-hook'), that function will be
  called after saving any file.

\end{description}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Text properties}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The following is a list of some text properties that may be useful in
designing Emaxml.

\begin{tblenv}
\begin{tabular}{|l|p{10cm}|}

  \hline

  {\bf Property} & {\bf Description} \\

  \hline
  \hline

  face & Face associated with the position. \\

  \hline

  mouse-face & Face used instead of `face' when the mouse is on or
  near the character. This allows the text to change layout according
  to the mouse pointer position. \\

  \hline

  display & This property activates various features that change the
  way text is displayed.  For example, it can make text appear taller
  or shorter, higher or lower, wider or narrower, or replaced with an
  image. \\

  \hline

  help-echo & If text has a string as its `help-echo' property, then
  when the mouse is moved onto that text, Emacs displays that string
  in the echo area. The value of this property can also be a function
  to be executed. \\

  \hline

  local-map & Used for key lookup instead of the buffer's local
  map. {\em (Controlling movement of point only to user/ubiquitous
  space.)} \\

  \hline

  syntax-table & The `syntax-table' property overrides what the syntax
  table says about this particular character. {\em (This affects many
  existing Emacs commands, especially at the character level of
  editing.)}\\

  \hline

  read-only & If a character has the property `read-only', then
  modifying that character is not allowed. {\em (Ubiquitous space.)}
  \\

  \hline

  invisible & A non-`nil' `invisible' property can make a character
  invisible on the screen. {\em (Storage of information related to
  Emaxml internals; implementation of display modes.)} \\

  \hline

  intangible & If a group of consecutive characters have equal and
  non-`nil' `intangible' properties, then point cannot be placed
  between them.  If the user tries to move point forward into the
  group, point actually moves to the end of the group.  If the user
  tries to move point backward into the group, point actually moves to
  the start of the group. {\em (Automatic space.)} \\

  \hline

  modification-hooks & Functions to be executed when the character is
  modified. {\em (Controlling what the user writes.)} \\

  \hline

\end{tabular}
\caption{Some text properties and (in {\em italic}) examples of use in
  Emaxml. See appendix~\ref{app:glossary} for definitions of Emacs
  terminology.}
\label{tbl:textproperties}
\end{tblenv}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Overlays}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The following is a list of some peculiar overlay properties that may
be useful in Emaxml design.

\begin{tblenv}
\begin{tabular}{|l|p{10cm}|}

  \hline

  {\bf Property} & {\bf Description} \\

  \hline
  \hline

priority & When two or more overlays cover the same character and both
specify a face for display, the face attributes of the one whose
`priority' value is larger override the face attributes of the lower
priority overlay. Overlays take priority over text properties. \\

window & If the `window' property is non-`nil', then the overlay
applies only on that window. Useful to display the same buffer with
different layouts in different windows. \\

insert-in-front-hooks & This property's value is a list of functions
to be called before and after inserting text right at the beginning of
the overlay. \\

insert-behind-hooks & This property's value is a list of functions to
be called before and after inserting text right at the end of the
overlay. \\

before-string & This property's value is a string to add to the
display at the beginning of the overlay.  The string does not appear
in the buffer in any sense---only on the screen. {\em Displaying
automatic space.} \\

after-string & This property's value is a string to add to the display
at the end of the overlay.  The string does not appear in the buffer
in any sense---only on the screen. {\em Displaying automatic space.} \\

evaporate & If this property is non-`nil', the overlay is deleted
automatically if it ever becomes empty (i.e., if it spans no
characters). \\

\hline
\end{tabular}

\caption{Some overlay properties and (in {\em italic}) examples of use
  in Emaxml. See appendix~\ref{app:glossary} for definitions of Emacs
  terminology.}
\label{tbl:overlayproperties}

\end{tblenv}

\end{document}


% LocalWords:  XW

%% \begin{description}

%% \item[In parsing] When a {\tt charData} object has been parsed, its
%%   string is processed according to the whitespace policy, and possibly
%%   discarded.

%% \item[In editing] If Allow-none is applied, Emaxml automatically
%%   removes whitespace at the beginning and end of a \luchardata\ when
%%   point leaves that \luchardata. If Allow-none or Allow-all-but-void
%%   is applied, Emaxml ensures that no void \luchardata\ \logus\ are
%%   created by enforcing low-level control.

%% \item[In writing] 

%% \end{description}