Improve prescription block autodetection.

This commit is contained in:
Daniel Kraus 2015-12-02 21:45:55 +01:00
parent e71e69d4c4
commit 86c8a9a1b4
2 changed files with 101 additions and 18 deletions

View File

@ -36,40 +36,118 @@ namespace zaaReloaded2.Medication
/// The detected block is selected. /// The detected block is selected.
/// </summary> /// </summary>
/// <returns>True if a block was detected, false if not.</returns> /// <returns>True if a block was detected, false if not.</returns>
/// <remarks>
/// <para>
/// Autodetection works by examining the document paragraph by
/// paragraph, starting at the end of the document. The first
/// block of at least two lines that are identified as prescription
/// lines is selected.
/// </para>
/// <para>
/// It should be noted that every paragraph (a.k.a. line) may
/// be regarded as one of three things:
/// </para>
/// <list type="number">
/// <item>A typical prescription line (in the form "Ramipril 5 mg 1-0-0")</item>
/// <item>A typical non-prescription text line</item>
/// <item>Something in between, e.g. a line with tab stops as in
/// "Ramipril 5 mg \t alle zwei Tage" or in "Prof. B. Oss \t Dr. A. Sistent"
/// </item>
/// </list>
/// <para>
/// It is the third type of line that may cause confusion. If such a line
/// is encountered at the start of a putative block of prescriptions, we
/// therefore enter a "fuzzy" state in the detection algorithm and take
/// it from there, i.e. disregard the block if there are no lines that
/// are clearly prescription lines, or accept the block if we do detect
/// adjacent lines with unequivocal prescriptions.
/// </para>
///
/// </remarks>
public static bool AutoDetect(Document document) public static bool AutoDetect(Document document)
{ {
Paragraph start = null; Paragraph start = null;
Paragraph end = null; Paragraph end = null;
bool insideBlock = false;
bool fuzzy = false;
bool result = false;
int i = document.Paragraphs.Count; int i = document.Paragraphs.Count;
while (i > 1) while (i > 1)
{ {
string line = document.Paragraphs[i].Range.Text; string line = document.Paragraphs[i].Range.Text;
if (Prescription.IsPrescriptionLine(line))
if (Prescription.IsCanonicalPrescriptionLine(line))
{ {
end = document.Paragraphs[i]; // The current line is unequivocally a prescription line:
break; // If we're not inside a block already, mark the bottom
// of the block.
// If we are inside a block already, make sure to leave
// the 'fuzzy' state because this clearly now is a prescription
// block.
if (insideBlock)
{
fuzzy = false;
}
else
{
end = document.Paragraphs[i];
insideBlock = true;
}
} }
else if (Prescription.IsPotentialPrescriptionLine(line))
{
// The current line is a putative prescription line:
// If we're not inside a block already, enter the
// "fuzzy" state.
// If we are inside a block, no special action is
// needed, we can continue with the next paragraph.
if (!insideBlock)
{
fuzzy = true;
insideBlock = true;
end = document.Paragraphs[i];
}
}
else
{
// The current line is not a prescription line:
// If we are currently in a definitive block of prescriptions,
// mark the line below the current line as the start of the block.
// If we're in a putative block, discard the information
// about the bottom end of the block and reset all flags.
if (insideBlock)
{
if (!fuzzy)
{
start = document.Paragraphs[i + 1];
break;
}
else
{
fuzzy = false;
insideBlock = false;
end = null;
}
}
}
i--; i--;
} }
if (end != null) if (end != null)
{ {
start = end; // If we don't have a start paragraph,
while (i > 2) // but do have an end paragraph, we set the start paragraph to the
// first paragraph of the document.
if (start == null)
{ {
if (!Prescription.IsPrescriptionLine(document.Paragraphs[i - 1].Range.Text)) start = document.Paragraphs[1];
{
start = document.Paragraphs[i];
break;
}
i--;
} }
document.Range(start.Range.Start, end.Range.End).Select(); document.Range(start.Range.Start, end.Range.End).Select();
return true; result = true;
} }
return false; return result;
} }
#endregion #endregion
@ -102,7 +180,7 @@ namespace zaaReloaded2.Medication
string[] lines = Helpers.SplitParagraphs(text); string[] lines = Helpers.SplitParagraphs(text);
foreach (string line in lines) foreach (string line in lines)
{ {
if (Prescription.IsPrescriptionLine(line)) if (Prescription.IsCanonicalPrescriptionLine(line))
{ {
addition = Prescription.ManyFromLine(line); addition = Prescription.ManyFromLine(line);
columns = System.Math.Max(columns, addition.Count); columns = System.Math.Max(columns, addition.Count);

View File

@ -35,11 +35,16 @@ namespace zaaReloaded2.Medication
/// </summary> /// </summary>
/// <param name="line">Line to inspect.</param> /// <param name="line">Line to inspect.</param>
/// <returns>True if the line contains prescriptions.</returns> /// <returns>True if the line contains prescriptions.</returns>
public static bool IsPrescriptionLine(string line) public static bool IsCanonicalPrescriptionLine(string line)
{ {
return canonicalRegex.IsMatch(line); return canonicalRegex.IsMatch(line);
} }
public static bool IsPotentialPrescriptionLine(string line)
{
return alternativeRegex.IsMatch(line);
}
#endregion #endregion
#region Factory #region Factory