RE: WikiName normalization

Volkar, John M. Mon, 22 Dec 2008 13:14:43 -0800

Sorry, I guess attachments don't make it.



Try this:






package com.ecyrd.jspwiki.parser;

import com.ecyrd.jspwiki.TextUtil;

import junit.framework.TestCase;

public class CleanLinkTest extends TestCase
{
    public void testDumbStuff() throws Exception
    {
        checkCleaning( null, null );
        checkCleaning( "     ", "" );
    }

    public void testAllowablePuncuation() throws Exception
    {
        checkCleaning( "Test Name" +
MarkupParser.PUNCTUATION_CHARS_ALLOWED, "Test Name" +
MarkupParser.PUNCTUATION_CHARS_ALLOWED );
        checkCleaning( "This is a (crazy & pointless) test, just my
$0.02 & stuff.",
                       "This Is A (Crazy & Pointless) Test, Just My
$0.02 & Stuff." );
    }

    // Should only allow letters, digits or explictly allowed chars,
dropping
    // all others.
    public void testIllegalCharacters() throws Exception
    {
        // These pass the old way....
        checkCleaning( "%Test Name", "Test Name" );// at start
        checkCleaning( "Test Name%", "Test Name" );// at end
        checkCleaning( "Test%Name", "Test Name" );// in middle

        // These currently fail due to spacing & casing...
        // Decision: Illegal-char to be simply dropped (not treated as
space).
        checkCleaning( "Te%st N%ame", "Test Name" );// in middle
        checkCleaning( "%%%T%e#st Na*me%", "Test Name" );// all mixed

    }

    // Should not allow multiple spaces in link names...
    public void testMultipleSpaces() throws Exception
    {
        // leading only, then trailing only, then between only...
        checkCleaning( "  Test Name", "Test Name" );
        checkCleaning( "Test Name  ", "Test Name" );
        checkCleaning( "Test  Name", "Test Name" );

        // leading, trailing and between...
        checkCleaning( "    Test    Name    ", "Test Name" );
    }

    // Should insist upon Proper Name case conventions for names...
    public void testCapitalizationMixtures() throws Exception
    {
        // These pass the old way...
        checkCleaning( "Test Name", "Test Name" ); // duh
        checkCleaning( "test Name", "Test Name" );

        // These currently fail...
        checkCleaning( "test name", "Test Name" );
        checkCleaning( "Test name", "Test Name" );
    }

    // Should respect CamelCased words and add spaces to be
commutative...
    public void testCamelCase() throws Exception
    {
        // These all currently fail...

        // Basic CamelCase
        checkCleaning( "SomeTestName", "Some Test Name" );

        // Mixture of CamelCase and spaces...
        checkCleaning( "Some TestName", "Some Test Name" );
        checkCleaning( "SomeTest Name", "Some Test Name" );
    }

    public void testSomePlayCombinations() throws Exception
    {
        // These are just random play strings...
        checkCleaning( "  My ni#cely   Titl%ed PDFDocument", "My Nicely
Titled PDF Document" );
        checkCleaning( "My nicelyTitled PDF document  ", "My Nicely
Titled PDF Document" );
        checkCleaning( "My~NicelyTitledPDFDoc#ument", "My Nicely Titled
PDF Document" );
    }

    public void testImpossiblePrefection() throws Exception
    {
        // And of course, perfection isn't possible... This gets close
to the
        // whole case-sensitivity at the page provider level. Two page
names
        // that differ ONLY by case are the SAME document in almost
every users
        // opnion (IMO). But this is a problem for another day.
        //TODO: checkCleaning( "MyNicelyTitledPdfDocument", "My Nicely
Titled PDF Document" );
    }

    // utility method...
    private void checkCleaning( String originalName, String expectedName
)
    {
        String cleanedName;

        // check which: OLD or NEW?
        //OLD: cleanedName = MarkupParser.cleanLink( originalName );
        cleanedName = cleanLink( originalName );

        // System.out.println( "\noriginal :" + originalName +
"\nold-clean:" +
        // cleanedName + "\nnew-clean:" + cleanLink( originalName ) );
        
        assertEquals( expectedName, cleanedName );
    }

    // This method can just replace that in MarkupParser.cleanLink
    public static String cleanLink( String linkName )
    {
        if( linkName == null )
            return null;

        // Disallow any leading and trailing whitespace...
        linkName = linkName.trim();

        // Name may have illegal chars, drop them...
        StringBuilder nameBuffer = new StringBuilder( linkName.length()
);
        for( int i = 0; i < linkName.length(); i++ )
        {
            char c = linkName.charAt( i );
            boolean isAllowedPuncuation =
(MarkupParser.PUNCTUATION_CHARS_ALLOWED.indexOf( c ) != -1);

            if( Character.isLetterOrDigit( c ) || isAllowedPuncuation )
                nameBuffer.append( c ); // just add it...
        }
        linkName = nameBuffer.toString();

        // Enforce consistent spacing of link names...
        linkName = TextUtil.beautifyString( linkName );
        // NOTE: Single word names remain unchanged.
        // NOTE: This only adds *spaces*, no changes to CASE is done...
        // TODO: any I18N considerations? don't think so... but maybe?
        // NOTE: This *could* be tied to the
jspwiki.breakTitlesWithSpaces
        // property, but that would result in this being
non-commutative; so
        // don't.

        // Name has internal spaces, maybe multiple, maybe with poor
        // capitalization for a proper name...
        nameBuffer.setLength( 0 ); // reuse buffer...

        // Disallow multiple spaces inside of the name...
        // Insist that Proper Name casing be used...
        boolean followsSpace = false;
        boolean followsAllowable = false;
        for( int i = 0; i < linkName.length(); i++ )
        {
            char c = linkName.charAt( i );

            
            if( followsSpace || followsAllowable || i == 0 ) // or is
first
            {
                if( followsSpace && Character.isSpaceChar( c ) )
                    continue; // swallow extra spaces...

                // insist upon names being properly cased...
                nameBuffer.append( Character.toTitleCase( c ) );
                // TODO: basically an I18N friendly toUpperCase? Think
so.

                followsSpace = false;
                followsAllowable = false;
            }
            else
            {
                nameBuffer.append( c ); // just add it...

                followsSpace = Character.isSpaceChar( c );
            }
            followsAllowable =
(MarkupParser.PUNCTUATION_CHARS_ALLOWED.indexOf( c ) != -1);
        }
        linkName = nameBuffer.toString();

        return linkName;
    }

}



















 

-----Original Message-----
From: Volkar, John M. [mailto:[email protected]] 
Sent: Monday, December 22, 2008 4:04 PM
To: [email protected]
Subject: RE: WikiName normalization

Janne,

To be honest I hadn't had time to look.  

When I got your inquiry, I was inclined to toss it off as "I'm just too
damn busy".  BUT, I would like it "fixed" to my satisfaction, and
well... it *is* the holidays here, so I figured I'd try to give myself a
present (its been a while since I worked in JSPWiki).  

----
I'll write this email while I progress and edit it afterward.

My notes as follows:
1) I pulled the current 2.8 branch figuring it would be best to work
against something stable.

2) Built in Eclipse.

3) Attempted to run AllTests, vaguely pissed that I had to hack a
properties file myself.  Where does it tell a newbie that this is
needed, what is needed in it, etc.  What a mess, they should just run.
Hacked at tests\etc\jspwiki.properties.tmpl (but I digress).

4) Ran AllTests, Runs: 942/942   Errors:93   Failures:17
** Some of this is probably because my test environment isn't fully
configured?
** Maybe some of this is because I've got JDK6 on this box? 

5) Started spelunking, think I found a good cut-point.

So: it looks like MarkupParser.cleanLink has *lots* of callers funneling
thru it...  And I think it's the source of my problems; it's not
__commutative__ ( "Test This", and "TestThis" don't come out the same
way, but "Test   This  ", and "test This" all come out the same as "Test
This"; <shudder/> even worse: "test this" comes out as "Test this" which
is NOT the same!) ick ack ook.  It *cleans* a link name, but doesn't
make any attempt to *normalize* it.

----
It also looks like maybe *some* of the calls into
MarkupParser.wikifyLink should instead be .cleanLink...  But I'm not
sure (e.g. look at CommandResolver.getFinalPageName,
JSONSearch.getSuggestions, why not .cleanLink?)  My guess is the
expanded chars allowed by cleanLink are not always acceptable to callers
of wikifyLink, this suggests to me the method names could be more clear.
But I digress.
----


My current thinking is that a little hack to the .cleanLink() will do
what I want.  TextUtil.beautifyString could be employed in .cleanLink to
make it commutative...

Let me see, how about some test cases to document the desired/expected
behavior? 

Okay, test case created and attached to this email.  A replacement
.cleanLink is at the end of the test case.

Splice in the replacement .cleanLink() and rerun the unit tests again.

6) Re-Ran AllTests, Runs: 942/942   Errors:99   Failures:99

Hmm, so there are more errors now...  Casual investigation makes it
appear that they are "expectation" type errors...

E.g. The MarkupParserTest patch at end shows what I mean...


----
<sigh/> Nothing's ever easy.

Can someone look this over the attached test cases and verify that the
expectations for the "cleaned & normalized" links is okay by everyone?
If so I can hunt down the other Junit errors and create patches for
them.  I strongly suggest reviewing the CleanLinkTest.java attached and
thinking about how each case *should* get cleaned.  If you have other
test cases to add, do so.

I'll have to look more and think thru just how much of this breakage is
serious.  I think that there is another piece of the puzzle (maybe at
the PageManager level) that needs fixed up.

Anyway this test case demonstrates what I think the desired
"normalization" should be like.

Comments more than welcome.

Regards,
John Volkar


PS: I would also like to "fix" the ugly %20 stuff in the URLs, maybe via
a behavior property in DefaultUrlConstructor (or maybe just an alternate
NormalizedShortViewUrlConstructor... For Christmas maybe?


----
Index:
C:/_volkar/workspace/JSPWiki-2.8/tests/com/ecyrd/jspwiki/parser/MarkupPa
rserTest.java
===================================================================
---
C:/_volkar/workspace/JSPWiki-2.8/tests/com/ecyrd/jspwiki/parser/MarkupPa
rserTest.java   (revision 728670)
+++
C:/_volkar/workspace/JSPWiki-2.8/tests/com/ecyrd/jspwiki/parser/MarkupPa
rserTest.java   (working copy)
@@ -24,17 +24,17 @@
 
     public void testCleanLink1()
     {
-        assertEquals( "--CleanLink--",
MarkupParser.cleanLink("--CleanLink--") );
+        assertEquals( "--Clean Link--",
MarkupParser.cleanLink("--CleanLink--") );
     }
 
     public void testCleanLink2()
     {
-        assertEquals( "CleanLink",
MarkupParser.cleanLink("??CleanLink??") );
+        assertEquals( "Clean Link",
MarkupParser.cleanLink("??CleanLink??") );
     }
     
     public void testCleanLink3()
     {
-        assertEquals( "Clean (link)", MarkupParser.cleanLink("Clean
(link)") );
+        assertEquals( "Clean (Link)", MarkupParser.cleanLink("Clean
(link)") );
     }
 
     public static Test suite()

RE: WikiName normalization

Reply via email to